diff --git a/DEPS b/DEPS index 7ff991dc4104c9..27f725c484a7a2 100644 --- a/DEPS +++ b/DEPS @@ -158,7 +158,7 @@ vars = { # # Note this revision should be updated with # third_party/boringssl/roll_boringssl.py, not roll-dep. - 'boringssl_revision': '7ef4223fb32431529a797c5b8d3bf26ece6c138b', + 'boringssl_revision': 'f109f2087349712d3ac717d15fab48e130618110', # Three lines of non-changing comments so that # the commit queue can handle CLs rolling google-toolbox-for-mac # and whatever else without interference from each other. diff --git a/third_party/boringssl/ios-arm/crypto/fipsmodule/bsaes-armv7.S b/third_party/boringssl/ios-arm/crypto/fipsmodule/bsaes-armv7.S index d997c6cb610a64..6c25fccfb61d38 100644 --- a/third_party/boringssl/ios-arm/crypto/fipsmodule/bsaes-armv7.S +++ b/third_party/boringssl/ios-arm/crypto/fipsmodule/bsaes-armv7.S @@ -1589,1001 +1589,5 @@ Lctr_enc_short_loop: ldmia sp!, {r4,r5,r6,r7,r8, pc} -.globl _bsaes_xts_encrypt -.private_extern _bsaes_xts_encrypt -#ifdef __thumb2__ -.thumb_func _bsaes_xts_encrypt -#endif -.align 4 -_bsaes_xts_encrypt: - mov ip, sp - stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} @ 0x20 - VFP_ABI_PUSH - mov r6, sp @ future r3 - - mov r7, r0 - mov r8, r1 - mov r9, r2 - mov r10, r3 - - sub r0, sp, #0x10 @ 0x10 - bic r0, #0xf @ align at 16 bytes - mov sp, r0 - -#ifdef XTS_CHAIN_TWEAK - ldr r0, [ip] @ pointer to input tweak -#else - @ generate initial tweak - ldr r0, [ip, #4] @ iv[] - mov r1, sp - ldr r2, [ip, #0] @ key2 - bl _AES_encrypt - mov r0,sp @ pointer to initial tweak -#endif - - ldr r1, [r10, #240] @ get # of rounds - mov r3, r6 -#ifndef BSAES_ASM_EXTENDED_KEY - @ allocate the key schedule on the stack - sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key - @ add r12, #96 @ size of bit-sliced key schedule - sub r12, #48 @ place for tweak[9] - - @ populate the key schedule - mov r4, r10 @ pass key - mov r5, r1 @ pass # of rounds - mov sp, r12 - add r12, #0x90 @ pass key schedule - bl _bsaes_key_convert - veor q7, q7, q15 @ fix up last round key - vstmia r12, {q7} @ save last round key -#else - ldr r12, [r10, #244] - eors r12, #1 - beq 0f - - str r12, [r10, #244] - mov r4, r10 @ pass key - mov r5, r1 @ pass # of rounds - add r12, r10, #248 @ pass key schedule - bl _bsaes_key_convert - veor q7, q7, q15 @ fix up last round key - vstmia r12, {q7} - -.align 2 - sub sp, #0x90 @ place for tweak[9] -#endif - - vld1.8 {q8}, [r0] @ initial tweak - adr r2, Lxts_magic - - subs r9, #0x80 - blo Lxts_enc_short - b Lxts_enc_loop - -.align 4 -Lxts_enc_loop: - vldmia r2, {q5} @ load XTS magic - vshr.s64 q6, q8, #63 - mov r0, sp - vand q6, q6, q5 - vadd.u64 q9, q8, q8 - vst1.64 {q8}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q9, #63 - veor q9, q9, q6 - vand q7, q7, q5 - vadd.u64 q10, q9, q9 - vst1.64 {q9}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q10, #63 - veor q10, q10, q7 - vand q6, q6, q5 - vld1.8 {q0}, [r7]! - vadd.u64 q11, q10, q10 - vst1.64 {q10}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q11, #63 - veor q11, q11, q6 - vand q7, q7, q5 - vld1.8 {q1}, [r7]! - veor q0, q0, q8 - vadd.u64 q12, q11, q11 - vst1.64 {q11}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q12, #63 - veor q12, q12, q7 - vand q6, q6, q5 - vld1.8 {q2}, [r7]! - veor q1, q1, q9 - vadd.u64 q13, q12, q12 - vst1.64 {q12}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q13, #63 - veor q13, q13, q6 - vand q7, q7, q5 - vld1.8 {q3}, [r7]! - veor q2, q2, q10 - vadd.u64 q14, q13, q13 - vst1.64 {q13}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q14, #63 - veor q14, q14, q7 - vand q6, q6, q5 - vld1.8 {q4}, [r7]! 
- veor q3, q3, q11 - vadd.u64 q15, q14, q14 - vst1.64 {q14}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q15, #63 - veor q15, q15, q6 - vand q7, q7, q5 - vld1.8 {q5}, [r7]! - veor q4, q4, q12 - vadd.u64 q8, q15, q15 - vst1.64 {q15}, [r0,:128]! - vswp d15,d14 - veor q8, q8, q7 - vst1.64 {q8}, [r0,:128] @ next round tweak - - vld1.8 {q6,q7}, [r7]! - veor q5, q5, q13 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q6, q6, q14 - mov r5, r1 @ pass rounds - veor q7, q7, q15 - mov r0, sp - - bl _bsaes_encrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12,q13}, [r0,:128]! - veor q1, q1, q9 - veor q8, q4, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q6, q11 - vld1.64 {q14,q15}, [r0,:128]! - veor q10, q3, q12 - vst1.8 {q8,q9}, [r8]! - veor q11, q7, q13 - veor q12, q2, q14 - vst1.8 {q10,q11}, [r8]! - veor q13, q5, q15 - vst1.8 {q12,q13}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - - subs r9, #0x80 - bpl Lxts_enc_loop - -Lxts_enc_short: - adds r9, #0x70 - bmi Lxts_enc_done - - vldmia r2, {q5} @ load XTS magic - vshr.s64 q7, q8, #63 - mov r0, sp - vand q7, q7, q5 - vadd.u64 q9, q8, q8 - vst1.64 {q8}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q9, #63 - veor q9, q9, q7 - vand q6, q6, q5 - vadd.u64 q10, q9, q9 - vst1.64 {q9}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q10, #63 - veor q10, q10, q6 - vand q7, q7, q5 - vld1.8 {q0}, [r7]! - subs r9, #0x10 - bmi Lxts_enc_1 - vadd.u64 q11, q10, q10 - vst1.64 {q10}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q11, #63 - veor q11, q11, q7 - vand q6, q6, q5 - vld1.8 {q1}, [r7]! - subs r9, #0x10 - bmi Lxts_enc_2 - veor q0, q0, q8 - vadd.u64 q12, q11, q11 - vst1.64 {q11}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q12, #63 - veor q12, q12, q6 - vand q7, q7, q5 - vld1.8 {q2}, [r7]! - subs r9, #0x10 - bmi Lxts_enc_3 - veor q1, q1, q9 - vadd.u64 q13, q12, q12 - vst1.64 {q12}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q13, #63 - veor q13, q13, q7 - vand q6, q6, q5 - vld1.8 {q3}, [r7]! - subs r9, #0x10 - bmi Lxts_enc_4 - veor q2, q2, q10 - vadd.u64 q14, q13, q13 - vst1.64 {q13}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q14, #63 - veor q14, q14, q6 - vand q7, q7, q5 - vld1.8 {q4}, [r7]! - subs r9, #0x10 - bmi Lxts_enc_5 - veor q3, q3, q11 - vadd.u64 q15, q14, q14 - vst1.64 {q14}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q15, #63 - veor q15, q15, q7 - vand q6, q6, q5 - vld1.8 {q5}, [r7]! - subs r9, #0x10 - bmi Lxts_enc_6 - veor q4, q4, q12 - sub r9, #0x10 - vst1.64 {q15}, [r0,:128] @ next round tweak - - vld1.8 {q6}, [r7]! - veor q5, q5, q13 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q6, q6, q14 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_encrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12,q13}, [r0,:128]! - veor q1, q1, q9 - veor q8, q4, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q6, q11 - vld1.64 {q14}, [r0,:128]! - veor q10, q3, q12 - vst1.8 {q8,q9}, [r8]! - veor q11, q7, q13 - veor q12, q2, q14 - vst1.8 {q10,q11}, [r8]! - vst1.8 {q12}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b Lxts_enc_done -.align 4 -Lxts_enc_6: - veor q4, q4, q12 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q5, q5, q13 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_encrypt8 - - vld1.64 {q8,q9}, [r0,:128]! 
- vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12,q13}, [r0,:128]! - veor q1, q1, q9 - veor q8, q4, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q6, q11 - veor q10, q3, q12 - vst1.8 {q8,q9}, [r8]! - veor q11, q7, q13 - vst1.8 {q10,q11}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b Lxts_enc_done - -@ put this in range for both ARM and Thumb mode adr instructions -.align 5 -Lxts_magic: -.quad 1, 0x87 - -.align 5 -Lxts_enc_5: - veor q3, q3, q11 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q4, q4, q12 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_encrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12}, [r0,:128]! - veor q1, q1, q9 - veor q8, q4, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q6, q11 - veor q10, q3, q12 - vst1.8 {q8,q9}, [r8]! - vst1.8 {q10}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b Lxts_enc_done -.align 4 -Lxts_enc_4: - veor q2, q2, q10 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q3, q3, q11 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_encrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - veor q1, q1, q9 - veor q8, q4, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q6, q11 - vst1.8 {q8,q9}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b Lxts_enc_done -.align 4 -Lxts_enc_3: - veor q1, q1, q9 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q2, q2, q10 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_encrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10}, [r0,:128]! - veor q0, q0, q8 - veor q1, q1, q9 - veor q8, q4, q10 - vst1.8 {q0,q1}, [r8]! - vst1.8 {q8}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b Lxts_enc_done -.align 4 -Lxts_enc_2: - veor q0, q0, q8 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q1, q1, q9 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_encrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - veor q0, q0, q8 - veor q1, q1, q9 - vst1.8 {q0,q1}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b Lxts_enc_done -.align 4 -Lxts_enc_1: - mov r0, sp - veor q0, q0, q8 - mov r1, sp - vst1.8 {q0}, [sp,:128] - mov r2, r10 - mov r4, r3 @ preserve fp - - bl _AES_encrypt - - vld1.8 {q0}, [sp,:128] - veor q0, q0, q8 - vst1.8 {q0}, [r8]! 
- mov r3, r4 - - vmov q8, q9 @ next round tweak - -Lxts_enc_done: -#ifndef XTS_CHAIN_TWEAK - adds r9, #0x10 - beq Lxts_enc_ret - sub r6, r8, #0x10 - -Lxts_enc_steal: - ldrb r0, [r7], #1 - ldrb r1, [r8, #-0x10] - strb r0, [r8, #-0x10] - strb r1, [r8], #1 - - subs r9, #1 - bhi Lxts_enc_steal - - vld1.8 {q0}, [r6] - mov r0, sp - veor q0, q0, q8 - mov r1, sp - vst1.8 {q0}, [sp,:128] - mov r2, r10 - mov r4, r3 @ preserve fp - - bl _AES_encrypt - - vld1.8 {q0}, [sp,:128] - veor q0, q0, q8 - vst1.8 {q0}, [r6] - mov r3, r4 -#endif - -Lxts_enc_ret: - bic r0, r3, #0xf - vmov.i32 q0, #0 - vmov.i32 q1, #0 -#ifdef XTS_CHAIN_TWEAK - ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak -#endif -Lxts_enc_bzero:@ wipe key schedule [if any] - vstmia sp!, {q0,q1} - cmp sp, r0 - bne Lxts_enc_bzero - - mov sp, r3 -#ifdef XTS_CHAIN_TWEAK - vst1.8 {q8}, [r1] -#endif - VFP_ABI_POP - ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return - - - -.globl _bsaes_xts_decrypt -.private_extern _bsaes_xts_decrypt -#ifdef __thumb2__ -.thumb_func _bsaes_xts_decrypt -#endif -.align 4 -_bsaes_xts_decrypt: - mov ip, sp - stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} @ 0x20 - VFP_ABI_PUSH - mov r6, sp @ future r3 - - mov r7, r0 - mov r8, r1 - mov r9, r2 - mov r10, r3 - - sub r0, sp, #0x10 @ 0x10 - bic r0, #0xf @ align at 16 bytes - mov sp, r0 - -#ifdef XTS_CHAIN_TWEAK - ldr r0, [ip] @ pointer to input tweak -#else - @ generate initial tweak - ldr r0, [ip, #4] @ iv[] - mov r1, sp - ldr r2, [ip, #0] @ key2 - bl _AES_encrypt - mov r0, sp @ pointer to initial tweak -#endif - - ldr r1, [r10, #240] @ get # of rounds - mov r3, r6 -#ifndef BSAES_ASM_EXTENDED_KEY - @ allocate the key schedule on the stack - sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key - @ add r12, #96 @ size of bit-sliced key schedule - sub r12, #48 @ place for tweak[9] - - @ populate the key schedule - mov r4, r10 @ pass key - mov r5, r1 @ pass # of rounds - mov sp, r12 - add r12, #0x90 @ pass key schedule - bl _bsaes_key_convert - add r4, sp, #0x90 - vldmia r4, {q6} - vstmia r12, {q15} @ save last round key - veor q7, q7, q6 @ fix up round 0 key - vstmia r4, {q7} -#else - ldr r12, [r10, #244] - eors r12, #1 - beq 0f - - str r12, [r10, #244] - mov r4, r10 @ pass key - mov r5, r1 @ pass # of rounds - add r12, r10, #248 @ pass key schedule - bl _bsaes_key_convert - add r4, r10, #248 - vldmia r4, {q6} - vstmia r12, {q15} @ save last round key - veor q7, q7, q6 @ fix up round 0 key - vstmia r4, {q7} - -.align 2 - sub sp, #0x90 @ place for tweak[9] -#endif - vld1.8 {q8}, [r0] @ initial tweak - adr r2, Lxts_magic - -#ifndef XTS_CHAIN_TWEAK - tst r9, #0xf @ if not multiple of 16 - it ne @ Thumb2 thing, sanity check in ARM - subne r9, #0x10 @ subtract another 16 bytes -#endif - subs r9, #0x80 - - blo Lxts_dec_short - b Lxts_dec_loop - -.align 4 -Lxts_dec_loop: - vldmia r2, {q5} @ load XTS magic - vshr.s64 q6, q8, #63 - mov r0, sp - vand q6, q6, q5 - vadd.u64 q9, q8, q8 - vst1.64 {q8}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q9, #63 - veor q9, q9, q6 - vand q7, q7, q5 - vadd.u64 q10, q9, q9 - vst1.64 {q9}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q10, #63 - veor q10, q10, q7 - vand q6, q6, q5 - vld1.8 {q0}, [r7]! - vadd.u64 q11, q10, q10 - vst1.64 {q10}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q11, #63 - veor q11, q11, q6 - vand q7, q7, q5 - vld1.8 {q1}, [r7]! - veor q0, q0, q8 - vadd.u64 q12, q11, q11 - vst1.64 {q11}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q12, #63 - veor q12, q12, q7 - vand q6, q6, q5 - vld1.8 {q2}, [r7]! 
- veor q1, q1, q9 - vadd.u64 q13, q12, q12 - vst1.64 {q12}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q13, #63 - veor q13, q13, q6 - vand q7, q7, q5 - vld1.8 {q3}, [r7]! - veor q2, q2, q10 - vadd.u64 q14, q13, q13 - vst1.64 {q13}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q14, #63 - veor q14, q14, q7 - vand q6, q6, q5 - vld1.8 {q4}, [r7]! - veor q3, q3, q11 - vadd.u64 q15, q14, q14 - vst1.64 {q14}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q15, #63 - veor q15, q15, q6 - vand q7, q7, q5 - vld1.8 {q5}, [r7]! - veor q4, q4, q12 - vadd.u64 q8, q15, q15 - vst1.64 {q15}, [r0,:128]! - vswp d15,d14 - veor q8, q8, q7 - vst1.64 {q8}, [r0,:128] @ next round tweak - - vld1.8 {q6,q7}, [r7]! - veor q5, q5, q13 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q6, q6, q14 - mov r5, r1 @ pass rounds - veor q7, q7, q15 - mov r0, sp - - bl _bsaes_decrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12,q13}, [r0,:128]! - veor q1, q1, q9 - veor q8, q6, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q4, q11 - vld1.64 {q14,q15}, [r0,:128]! - veor q10, q2, q12 - vst1.8 {q8,q9}, [r8]! - veor q11, q7, q13 - veor q12, q3, q14 - vst1.8 {q10,q11}, [r8]! - veor q13, q5, q15 - vst1.8 {q12,q13}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - - subs r9, #0x80 - bpl Lxts_dec_loop - -Lxts_dec_short: - adds r9, #0x70 - bmi Lxts_dec_done - - vldmia r2, {q5} @ load XTS magic - vshr.s64 q7, q8, #63 - mov r0, sp - vand q7, q7, q5 - vadd.u64 q9, q8, q8 - vst1.64 {q8}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q9, #63 - veor q9, q9, q7 - vand q6, q6, q5 - vadd.u64 q10, q9, q9 - vst1.64 {q9}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q10, #63 - veor q10, q10, q6 - vand q7, q7, q5 - vld1.8 {q0}, [r7]! - subs r9, #0x10 - bmi Lxts_dec_1 - vadd.u64 q11, q10, q10 - vst1.64 {q10}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q11, #63 - veor q11, q11, q7 - vand q6, q6, q5 - vld1.8 {q1}, [r7]! - subs r9, #0x10 - bmi Lxts_dec_2 - veor q0, q0, q8 - vadd.u64 q12, q11, q11 - vst1.64 {q11}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q12, #63 - veor q12, q12, q6 - vand q7, q7, q5 - vld1.8 {q2}, [r7]! - subs r9, #0x10 - bmi Lxts_dec_3 - veor q1, q1, q9 - vadd.u64 q13, q12, q12 - vst1.64 {q12}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q13, #63 - veor q13, q13, q7 - vand q6, q6, q5 - vld1.8 {q3}, [r7]! - subs r9, #0x10 - bmi Lxts_dec_4 - veor q2, q2, q10 - vadd.u64 q14, q13, q13 - vst1.64 {q13}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q14, #63 - veor q14, q14, q6 - vand q7, q7, q5 - vld1.8 {q4}, [r7]! - subs r9, #0x10 - bmi Lxts_dec_5 - veor q3, q3, q11 - vadd.u64 q15, q14, q14 - vst1.64 {q14}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q15, #63 - veor q15, q15, q7 - vand q6, q6, q5 - vld1.8 {q5}, [r7]! - subs r9, #0x10 - bmi Lxts_dec_6 - veor q4, q4, q12 - sub r9, #0x10 - vst1.64 {q15}, [r0,:128] @ next round tweak - - vld1.8 {q6}, [r7]! - veor q5, q5, q13 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q6, q6, q14 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_decrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12,q13}, [r0,:128]! - veor q1, q1, q9 - veor q8, q6, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q4, q11 - vld1.64 {q14}, [r0,:128]! - veor q10, q2, q12 - vst1.8 {q8,q9}, [r8]! - veor q11, q7, q13 - veor q12, q3, q14 - vst1.8 {q10,q11}, [r8]! - vst1.8 {q12}, [r8]! 
- - vld1.64 {q8}, [r0,:128] @ next round tweak - b Lxts_dec_done -.align 4 -Lxts_dec_6: - vst1.64 {q14}, [r0,:128] @ next round tweak - - veor q4, q4, q12 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q5, q5, q13 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_decrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12,q13}, [r0,:128]! - veor q1, q1, q9 - veor q8, q6, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q4, q11 - veor q10, q2, q12 - vst1.8 {q8,q9}, [r8]! - veor q11, q7, q13 - vst1.8 {q10,q11}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b Lxts_dec_done -.align 4 -Lxts_dec_5: - veor q3, q3, q11 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q4, q4, q12 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_decrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12}, [r0,:128]! - veor q1, q1, q9 - veor q8, q6, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q4, q11 - veor q10, q2, q12 - vst1.8 {q8,q9}, [r8]! - vst1.8 {q10}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b Lxts_dec_done -.align 4 -Lxts_dec_4: - veor q2, q2, q10 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q3, q3, q11 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_decrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - veor q1, q1, q9 - veor q8, q6, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q4, q11 - vst1.8 {q8,q9}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b Lxts_dec_done -.align 4 -Lxts_dec_3: - veor q1, q1, q9 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q2, q2, q10 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_decrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10}, [r0,:128]! - veor q0, q0, q8 - veor q1, q1, q9 - veor q8, q6, q10 - vst1.8 {q0,q1}, [r8]! - vst1.8 {q8}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b Lxts_dec_done -.align 4 -Lxts_dec_2: - veor q0, q0, q8 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q1, q1, q9 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_decrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - veor q0, q0, q8 - veor q1, q1, q9 - vst1.8 {q0,q1}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b Lxts_dec_done -.align 4 -Lxts_dec_1: - mov r0, sp - veor q0, q0, q8 - mov r1, sp - vst1.8 {q0}, [sp,:128] - mov r5, r2 @ preserve magic - mov r2, r10 - mov r4, r3 @ preserve fp - - bl _AES_decrypt - - vld1.8 {q0}, [sp,:128] - veor q0, q0, q8 - vst1.8 {q0}, [r8]! - mov r3, r4 - mov r2, r5 - - vmov q8, q9 @ next round tweak - -Lxts_dec_done: -#ifndef XTS_CHAIN_TWEAK - adds r9, #0x10 - beq Lxts_dec_ret - - @ calculate one round of extra tweak for the stolen ciphertext - vldmia r2, {q5} - vshr.s64 q6, q8, #63 - vand q6, q6, q5 - vadd.u64 q9, q8, q8 - vswp d13,d12 - veor q9, q9, q6 - - @ perform the final decryption with the last tweak value - vld1.8 {q0}, [r7]! 
- mov r0, sp - veor q0, q0, q9 - mov r1, sp - vst1.8 {q0}, [sp,:128] - mov r2, r10 - mov r4, r3 @ preserve fp - - bl _AES_decrypt - - vld1.8 {q0}, [sp,:128] - veor q0, q0, q9 - vst1.8 {q0}, [r8] - - mov r6, r8 -Lxts_dec_steal: - ldrb r1, [r8] - ldrb r0, [r7], #1 - strb r1, [r8, #0x10] - strb r0, [r8], #1 - - subs r9, #1 - bhi Lxts_dec_steal - - vld1.8 {q0}, [r6] - mov r0, sp - veor q0, q8 - mov r1, sp - vst1.8 {q0}, [sp,:128] - mov r2, r10 - - bl _AES_decrypt - - vld1.8 {q0}, [sp,:128] - veor q0, q0, q8 - vst1.8 {q0}, [r6] - mov r3, r4 -#endif - -Lxts_dec_ret: - bic r0, r3, #0xf - vmov.i32 q0, #0 - vmov.i32 q1, #0 -#ifdef XTS_CHAIN_TWEAK - ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak -#endif -Lxts_dec_bzero:@ wipe key schedule [if any] - vstmia sp!, {q0,q1} - cmp sp, r0 - bne Lxts_dec_bzero - - mov sp, r3 -#ifdef XTS_CHAIN_TWEAK - vst1.8 {q8}, [r1] -#endif - VFP_ABI_POP - ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return - - #endif #endif // !OPENSSL_NO_ASM diff --git a/third_party/boringssl/linux-arm/crypto/fipsmodule/bsaes-armv7.S b/third_party/boringssl/linux-arm/crypto/fipsmodule/bsaes-armv7.S index 38a029029319f1..ef3ee85aaf7445 100644 --- a/third_party/boringssl/linux-arm/crypto/fipsmodule/bsaes-armv7.S +++ b/third_party/boringssl/linux-arm/crypto/fipsmodule/bsaes-armv7.S @@ -1580,998 +1580,6 @@ bsaes_ctr32_encrypt_blocks: ldmia sp!, {r4,r5,r6,r7,r8, pc} .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks -.globl bsaes_xts_encrypt -.hidden bsaes_xts_encrypt -.type bsaes_xts_encrypt,%function -.align 4 -bsaes_xts_encrypt: - mov ip, sp - stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} @ 0x20 - VFP_ABI_PUSH - mov r6, sp @ future r3 - - mov r7, r0 - mov r8, r1 - mov r9, r2 - mov r10, r3 - - sub r0, sp, #0x10 @ 0x10 - bic r0, #0xf @ align at 16 bytes - mov sp, r0 - -#ifdef XTS_CHAIN_TWEAK - ldr r0, [ip] @ pointer to input tweak -#else - @ generate initial tweak - ldr r0, [ip, #4] @ iv[] - mov r1, sp - ldr r2, [ip, #0] @ key2 - bl AES_encrypt - mov r0,sp @ pointer to initial tweak -#endif - - ldr r1, [r10, #240] @ get # of rounds - mov r3, r6 -#ifndef BSAES_ASM_EXTENDED_KEY - @ allocate the key schedule on the stack - sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key - @ add r12, #96 @ size of bit-sliced key schedule - sub r12, #48 @ place for tweak[9] - - @ populate the key schedule - mov r4, r10 @ pass key - mov r5, r1 @ pass # of rounds - mov sp, r12 - add r12, #0x90 @ pass key schedule - bl _bsaes_key_convert - veor q7, q7, q15 @ fix up last round key - vstmia r12, {q7} @ save last round key -#else - ldr r12, [r10, #244] - eors r12, #1 - beq 0f - - str r12, [r10, #244] - mov r4, r10 @ pass key - mov r5, r1 @ pass # of rounds - add r12, r10, #248 @ pass key schedule - bl _bsaes_key_convert - veor q7, q7, q15 @ fix up last round key - vstmia r12, {q7} - -.align 2 - sub sp, #0x90 @ place for tweak[9] -#endif - - vld1.8 {q8}, [r0] @ initial tweak - adr r2, .Lxts_magic - - subs r9, #0x80 - blo .Lxts_enc_short - b .Lxts_enc_loop - -.align 4 -.Lxts_enc_loop: - vldmia r2, {q5} @ load XTS magic - vshr.s64 q6, q8, #63 - mov r0, sp - vand q6, q6, q5 - vadd.u64 q9, q8, q8 - vst1.64 {q8}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q9, #63 - veor q9, q9, q6 - vand q7, q7, q5 - vadd.u64 q10, q9, q9 - vst1.64 {q9}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q10, #63 - veor q10, q10, q7 - vand q6, q6, q5 - vld1.8 {q0}, [r7]! - vadd.u64 q11, q10, q10 - vst1.64 {q10}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q11, #63 - veor q11, q11, q6 - vand q7, q7, q5 - vld1.8 {q1}, [r7]! 
- veor q0, q0, q8 - vadd.u64 q12, q11, q11 - vst1.64 {q11}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q12, #63 - veor q12, q12, q7 - vand q6, q6, q5 - vld1.8 {q2}, [r7]! - veor q1, q1, q9 - vadd.u64 q13, q12, q12 - vst1.64 {q12}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q13, #63 - veor q13, q13, q6 - vand q7, q7, q5 - vld1.8 {q3}, [r7]! - veor q2, q2, q10 - vadd.u64 q14, q13, q13 - vst1.64 {q13}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q14, #63 - veor q14, q14, q7 - vand q6, q6, q5 - vld1.8 {q4}, [r7]! - veor q3, q3, q11 - vadd.u64 q15, q14, q14 - vst1.64 {q14}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q15, #63 - veor q15, q15, q6 - vand q7, q7, q5 - vld1.8 {q5}, [r7]! - veor q4, q4, q12 - vadd.u64 q8, q15, q15 - vst1.64 {q15}, [r0,:128]! - vswp d15,d14 - veor q8, q8, q7 - vst1.64 {q8}, [r0,:128] @ next round tweak - - vld1.8 {q6,q7}, [r7]! - veor q5, q5, q13 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q6, q6, q14 - mov r5, r1 @ pass rounds - veor q7, q7, q15 - mov r0, sp - - bl _bsaes_encrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12,q13}, [r0,:128]! - veor q1, q1, q9 - veor q8, q4, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q6, q11 - vld1.64 {q14,q15}, [r0,:128]! - veor q10, q3, q12 - vst1.8 {q8,q9}, [r8]! - veor q11, q7, q13 - veor q12, q2, q14 - vst1.8 {q10,q11}, [r8]! - veor q13, q5, q15 - vst1.8 {q12,q13}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - - subs r9, #0x80 - bpl .Lxts_enc_loop - -.Lxts_enc_short: - adds r9, #0x70 - bmi .Lxts_enc_done - - vldmia r2, {q5} @ load XTS magic - vshr.s64 q7, q8, #63 - mov r0, sp - vand q7, q7, q5 - vadd.u64 q9, q8, q8 - vst1.64 {q8}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q9, #63 - veor q9, q9, q7 - vand q6, q6, q5 - vadd.u64 q10, q9, q9 - vst1.64 {q9}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q10, #63 - veor q10, q10, q6 - vand q7, q7, q5 - vld1.8 {q0}, [r7]! - subs r9, #0x10 - bmi .Lxts_enc_1 - vadd.u64 q11, q10, q10 - vst1.64 {q10}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q11, #63 - veor q11, q11, q7 - vand q6, q6, q5 - vld1.8 {q1}, [r7]! - subs r9, #0x10 - bmi .Lxts_enc_2 - veor q0, q0, q8 - vadd.u64 q12, q11, q11 - vst1.64 {q11}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q12, #63 - veor q12, q12, q6 - vand q7, q7, q5 - vld1.8 {q2}, [r7]! - subs r9, #0x10 - bmi .Lxts_enc_3 - veor q1, q1, q9 - vadd.u64 q13, q12, q12 - vst1.64 {q12}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q13, #63 - veor q13, q13, q7 - vand q6, q6, q5 - vld1.8 {q3}, [r7]! - subs r9, #0x10 - bmi .Lxts_enc_4 - veor q2, q2, q10 - vadd.u64 q14, q13, q13 - vst1.64 {q13}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q14, #63 - veor q14, q14, q6 - vand q7, q7, q5 - vld1.8 {q4}, [r7]! - subs r9, #0x10 - bmi .Lxts_enc_5 - veor q3, q3, q11 - vadd.u64 q15, q14, q14 - vst1.64 {q14}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q15, #63 - veor q15, q15, q7 - vand q6, q6, q5 - vld1.8 {q5}, [r7]! - subs r9, #0x10 - bmi .Lxts_enc_6 - veor q4, q4, q12 - sub r9, #0x10 - vst1.64 {q15}, [r0,:128] @ next round tweak - - vld1.8 {q6}, [r7]! - veor q5, q5, q13 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q6, q6, q14 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_encrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12,q13}, [r0,:128]! - veor q1, q1, q9 - veor q8, q4, q10 - vst1.8 {q0,q1}, [r8]! 
- veor q9, q6, q11 - vld1.64 {q14}, [r0,:128]! - veor q10, q3, q12 - vst1.8 {q8,q9}, [r8]! - veor q11, q7, q13 - veor q12, q2, q14 - vst1.8 {q10,q11}, [r8]! - vst1.8 {q12}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_enc_done -.align 4 -.Lxts_enc_6: - veor q4, q4, q12 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q5, q5, q13 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_encrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12,q13}, [r0,:128]! - veor q1, q1, q9 - veor q8, q4, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q6, q11 - veor q10, q3, q12 - vst1.8 {q8,q9}, [r8]! - veor q11, q7, q13 - vst1.8 {q10,q11}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_enc_done - -@ put this in range for both ARM and Thumb mode adr instructions -.align 5 -.Lxts_magic: -.quad 1, 0x87 - -.align 5 -.Lxts_enc_5: - veor q3, q3, q11 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q4, q4, q12 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_encrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12}, [r0,:128]! - veor q1, q1, q9 - veor q8, q4, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q6, q11 - veor q10, q3, q12 - vst1.8 {q8,q9}, [r8]! - vst1.8 {q10}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_enc_done -.align 4 -.Lxts_enc_4: - veor q2, q2, q10 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q3, q3, q11 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_encrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - veor q1, q1, q9 - veor q8, q4, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q6, q11 - vst1.8 {q8,q9}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_enc_done -.align 4 -.Lxts_enc_3: - veor q1, q1, q9 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q2, q2, q10 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_encrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10}, [r0,:128]! - veor q0, q0, q8 - veor q1, q1, q9 - veor q8, q4, q10 - vst1.8 {q0,q1}, [r8]! - vst1.8 {q8}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_enc_done -.align 4 -.Lxts_enc_2: - veor q0, q0, q8 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q1, q1, q9 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_encrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - veor q0, q0, q8 - veor q1, q1, q9 - vst1.8 {q0,q1}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_enc_done -.align 4 -.Lxts_enc_1: - mov r0, sp - veor q0, q0, q8 - mov r1, sp - vst1.8 {q0}, [sp,:128] - mov r2, r10 - mov r4, r3 @ preserve fp - - bl AES_encrypt - - vld1.8 {q0}, [sp,:128] - veor q0, q0, q8 - vst1.8 {q0}, [r8]! 
- mov r3, r4 - - vmov q8, q9 @ next round tweak - -.Lxts_enc_done: -#ifndef XTS_CHAIN_TWEAK - adds r9, #0x10 - beq .Lxts_enc_ret - sub r6, r8, #0x10 - -.Lxts_enc_steal: - ldrb r0, [r7], #1 - ldrb r1, [r8, #-0x10] - strb r0, [r8, #-0x10] - strb r1, [r8], #1 - - subs r9, #1 - bhi .Lxts_enc_steal - - vld1.8 {q0}, [r6] - mov r0, sp - veor q0, q0, q8 - mov r1, sp - vst1.8 {q0}, [sp,:128] - mov r2, r10 - mov r4, r3 @ preserve fp - - bl AES_encrypt - - vld1.8 {q0}, [sp,:128] - veor q0, q0, q8 - vst1.8 {q0}, [r6] - mov r3, r4 -#endif - -.Lxts_enc_ret: - bic r0, r3, #0xf - vmov.i32 q0, #0 - vmov.i32 q1, #0 -#ifdef XTS_CHAIN_TWEAK - ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak -#endif -.Lxts_enc_bzero:@ wipe key schedule [if any] - vstmia sp!, {q0,q1} - cmp sp, r0 - bne .Lxts_enc_bzero - - mov sp, r3 -#ifdef XTS_CHAIN_TWEAK - vst1.8 {q8}, [r1] -#endif - VFP_ABI_POP - ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return - -.size bsaes_xts_encrypt,.-bsaes_xts_encrypt - -.globl bsaes_xts_decrypt -.hidden bsaes_xts_decrypt -.type bsaes_xts_decrypt,%function -.align 4 -bsaes_xts_decrypt: - mov ip, sp - stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} @ 0x20 - VFP_ABI_PUSH - mov r6, sp @ future r3 - - mov r7, r0 - mov r8, r1 - mov r9, r2 - mov r10, r3 - - sub r0, sp, #0x10 @ 0x10 - bic r0, #0xf @ align at 16 bytes - mov sp, r0 - -#ifdef XTS_CHAIN_TWEAK - ldr r0, [ip] @ pointer to input tweak -#else - @ generate initial tweak - ldr r0, [ip, #4] @ iv[] - mov r1, sp - ldr r2, [ip, #0] @ key2 - bl AES_encrypt - mov r0, sp @ pointer to initial tweak -#endif - - ldr r1, [r10, #240] @ get # of rounds - mov r3, r6 -#ifndef BSAES_ASM_EXTENDED_KEY - @ allocate the key schedule on the stack - sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key - @ add r12, #96 @ size of bit-sliced key schedule - sub r12, #48 @ place for tweak[9] - - @ populate the key schedule - mov r4, r10 @ pass key - mov r5, r1 @ pass # of rounds - mov sp, r12 - add r12, #0x90 @ pass key schedule - bl _bsaes_key_convert - add r4, sp, #0x90 - vldmia r4, {q6} - vstmia r12, {q15} @ save last round key - veor q7, q7, q6 @ fix up round 0 key - vstmia r4, {q7} -#else - ldr r12, [r10, #244] - eors r12, #1 - beq 0f - - str r12, [r10, #244] - mov r4, r10 @ pass key - mov r5, r1 @ pass # of rounds - add r12, r10, #248 @ pass key schedule - bl _bsaes_key_convert - add r4, r10, #248 - vldmia r4, {q6} - vstmia r12, {q15} @ save last round key - veor q7, q7, q6 @ fix up round 0 key - vstmia r4, {q7} - -.align 2 - sub sp, #0x90 @ place for tweak[9] -#endif - vld1.8 {q8}, [r0] @ initial tweak - adr r2, .Lxts_magic - -#ifndef XTS_CHAIN_TWEAK - tst r9, #0xf @ if not multiple of 16 - it ne @ Thumb2 thing, sanity check in ARM - subne r9, #0x10 @ subtract another 16 bytes -#endif - subs r9, #0x80 - - blo .Lxts_dec_short - b .Lxts_dec_loop - -.align 4 -.Lxts_dec_loop: - vldmia r2, {q5} @ load XTS magic - vshr.s64 q6, q8, #63 - mov r0, sp - vand q6, q6, q5 - vadd.u64 q9, q8, q8 - vst1.64 {q8}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q9, #63 - veor q9, q9, q6 - vand q7, q7, q5 - vadd.u64 q10, q9, q9 - vst1.64 {q9}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q10, #63 - veor q10, q10, q7 - vand q6, q6, q5 - vld1.8 {q0}, [r7]! - vadd.u64 q11, q10, q10 - vst1.64 {q10}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q11, #63 - veor q11, q11, q6 - vand q7, q7, q5 - vld1.8 {q1}, [r7]! - veor q0, q0, q8 - vadd.u64 q12, q11, q11 - vst1.64 {q11}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q12, #63 - veor q12, q12, q7 - vand q6, q6, q5 - vld1.8 {q2}, [r7]! 
- veor q1, q1, q9 - vadd.u64 q13, q12, q12 - vst1.64 {q12}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q13, #63 - veor q13, q13, q6 - vand q7, q7, q5 - vld1.8 {q3}, [r7]! - veor q2, q2, q10 - vadd.u64 q14, q13, q13 - vst1.64 {q13}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q14, #63 - veor q14, q14, q7 - vand q6, q6, q5 - vld1.8 {q4}, [r7]! - veor q3, q3, q11 - vadd.u64 q15, q14, q14 - vst1.64 {q14}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q15, #63 - veor q15, q15, q6 - vand q7, q7, q5 - vld1.8 {q5}, [r7]! - veor q4, q4, q12 - vadd.u64 q8, q15, q15 - vst1.64 {q15}, [r0,:128]! - vswp d15,d14 - veor q8, q8, q7 - vst1.64 {q8}, [r0,:128] @ next round tweak - - vld1.8 {q6,q7}, [r7]! - veor q5, q5, q13 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q6, q6, q14 - mov r5, r1 @ pass rounds - veor q7, q7, q15 - mov r0, sp - - bl _bsaes_decrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12,q13}, [r0,:128]! - veor q1, q1, q9 - veor q8, q6, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q4, q11 - vld1.64 {q14,q15}, [r0,:128]! - veor q10, q2, q12 - vst1.8 {q8,q9}, [r8]! - veor q11, q7, q13 - veor q12, q3, q14 - vst1.8 {q10,q11}, [r8]! - veor q13, q5, q15 - vst1.8 {q12,q13}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - - subs r9, #0x80 - bpl .Lxts_dec_loop - -.Lxts_dec_short: - adds r9, #0x70 - bmi .Lxts_dec_done - - vldmia r2, {q5} @ load XTS magic - vshr.s64 q7, q8, #63 - mov r0, sp - vand q7, q7, q5 - vadd.u64 q9, q8, q8 - vst1.64 {q8}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q9, #63 - veor q9, q9, q7 - vand q6, q6, q5 - vadd.u64 q10, q9, q9 - vst1.64 {q9}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q10, #63 - veor q10, q10, q6 - vand q7, q7, q5 - vld1.8 {q0}, [r7]! - subs r9, #0x10 - bmi .Lxts_dec_1 - vadd.u64 q11, q10, q10 - vst1.64 {q10}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q11, #63 - veor q11, q11, q7 - vand q6, q6, q5 - vld1.8 {q1}, [r7]! - subs r9, #0x10 - bmi .Lxts_dec_2 - veor q0, q0, q8 - vadd.u64 q12, q11, q11 - vst1.64 {q11}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q12, #63 - veor q12, q12, q6 - vand q7, q7, q5 - vld1.8 {q2}, [r7]! - subs r9, #0x10 - bmi .Lxts_dec_3 - veor q1, q1, q9 - vadd.u64 q13, q12, q12 - vst1.64 {q12}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q13, #63 - veor q13, q13, q7 - vand q6, q6, q5 - vld1.8 {q3}, [r7]! - subs r9, #0x10 - bmi .Lxts_dec_4 - veor q2, q2, q10 - vadd.u64 q14, q13, q13 - vst1.64 {q13}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q14, #63 - veor q14, q14, q6 - vand q7, q7, q5 - vld1.8 {q4}, [r7]! - subs r9, #0x10 - bmi .Lxts_dec_5 - veor q3, q3, q11 - vadd.u64 q15, q14, q14 - vst1.64 {q14}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q15, #63 - veor q15, q15, q7 - vand q6, q6, q5 - vld1.8 {q5}, [r7]! - subs r9, #0x10 - bmi .Lxts_dec_6 - veor q4, q4, q12 - sub r9, #0x10 - vst1.64 {q15}, [r0,:128] @ next round tweak - - vld1.8 {q6}, [r7]! - veor q5, q5, q13 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q6, q6, q14 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_decrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12,q13}, [r0,:128]! - veor q1, q1, q9 - veor q8, q6, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q4, q11 - vld1.64 {q14}, [r0,:128]! - veor q10, q2, q12 - vst1.8 {q8,q9}, [r8]! - veor q11, q7, q13 - veor q12, q3, q14 - vst1.8 {q10,q11}, [r8]! - vst1.8 {q12}, [r8]! 
- - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_dec_done -.align 4 -.Lxts_dec_6: - vst1.64 {q14}, [r0,:128] @ next round tweak - - veor q4, q4, q12 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q5, q5, q13 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_decrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12,q13}, [r0,:128]! - veor q1, q1, q9 - veor q8, q6, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q4, q11 - veor q10, q2, q12 - vst1.8 {q8,q9}, [r8]! - veor q11, q7, q13 - vst1.8 {q10,q11}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_dec_done -.align 4 -.Lxts_dec_5: - veor q3, q3, q11 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q4, q4, q12 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_decrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12}, [r0,:128]! - veor q1, q1, q9 - veor q8, q6, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q4, q11 - veor q10, q2, q12 - vst1.8 {q8,q9}, [r8]! - vst1.8 {q10}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_dec_done -.align 4 -.Lxts_dec_4: - veor q2, q2, q10 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q3, q3, q11 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_decrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - veor q1, q1, q9 - veor q8, q6, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q4, q11 - vst1.8 {q8,q9}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_dec_done -.align 4 -.Lxts_dec_3: - veor q1, q1, q9 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q2, q2, q10 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_decrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10}, [r0,:128]! - veor q0, q0, q8 - veor q1, q1, q9 - veor q8, q6, q10 - vst1.8 {q0,q1}, [r8]! - vst1.8 {q8}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_dec_done -.align 4 -.Lxts_dec_2: - veor q0, q0, q8 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q1, q1, q9 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_decrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - veor q0, q0, q8 - veor q1, q1, q9 - vst1.8 {q0,q1}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_dec_done -.align 4 -.Lxts_dec_1: - mov r0, sp - veor q0, q0, q8 - mov r1, sp - vst1.8 {q0}, [sp,:128] - mov r5, r2 @ preserve magic - mov r2, r10 - mov r4, r3 @ preserve fp - - bl AES_decrypt - - vld1.8 {q0}, [sp,:128] - veor q0, q0, q8 - vst1.8 {q0}, [r8]! - mov r3, r4 - mov r2, r5 - - vmov q8, q9 @ next round tweak - -.Lxts_dec_done: -#ifndef XTS_CHAIN_TWEAK - adds r9, #0x10 - beq .Lxts_dec_ret - - @ calculate one round of extra tweak for the stolen ciphertext - vldmia r2, {q5} - vshr.s64 q6, q8, #63 - vand q6, q6, q5 - vadd.u64 q9, q8, q8 - vswp d13,d12 - veor q9, q9, q6 - - @ perform the final decryption with the last tweak value - vld1.8 {q0}, [r7]! 
- mov r0, sp - veor q0, q0, q9 - mov r1, sp - vst1.8 {q0}, [sp,:128] - mov r2, r10 - mov r4, r3 @ preserve fp - - bl AES_decrypt - - vld1.8 {q0}, [sp,:128] - veor q0, q0, q9 - vst1.8 {q0}, [r8] - - mov r6, r8 -.Lxts_dec_steal: - ldrb r1, [r8] - ldrb r0, [r7], #1 - strb r1, [r8, #0x10] - strb r0, [r8], #1 - - subs r9, #1 - bhi .Lxts_dec_steal - - vld1.8 {q0}, [r6] - mov r0, sp - veor q0, q8 - mov r1, sp - vst1.8 {q0}, [sp,:128] - mov r2, r10 - - bl AES_decrypt - - vld1.8 {q0}, [sp,:128] - veor q0, q0, q8 - vst1.8 {q0}, [r6] - mov r3, r4 -#endif - -.Lxts_dec_ret: - bic r0, r3, #0xf - vmov.i32 q0, #0 - vmov.i32 q1, #0 -#ifdef XTS_CHAIN_TWEAK - ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak -#endif -.Lxts_dec_bzero:@ wipe key schedule [if any] - vstmia sp!, {q0,q1} - cmp sp, r0 - bne .Lxts_dec_bzero - - mov sp, r3 -#ifdef XTS_CHAIN_TWEAK - vst1.8 {q8}, [r1] -#endif - VFP_ABI_POP - ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return - -.size bsaes_xts_decrypt,.-bsaes_xts_decrypt #endif #endif #endif // !OPENSSL_NO_ASM diff --git a/third_party/boringssl/linux-x86_64/crypto/chacha/chacha-x86_64.S b/third_party/boringssl/linux-x86_64/crypto/chacha/chacha-x86_64.S index 423c697ed0ca08..4e2267bb2344b4 100644 --- a/third_party/boringssl/linux-x86_64/crypto/chacha/chacha-x86_64.S +++ b/third_party/boringssl/linux-x86_64/crypto/chacha/chacha-x86_64.S @@ -79,13 +79,13 @@ ChaCha20_ctr32: .cfi_adjust_cfa_offset 88 .Lctr32_body: -#movdqa .Lsigma(%rip),%xmm0 + movdqu (%rcx),%xmm1 movdqu 16(%rcx),%xmm2 movdqu (%r8),%xmm3 movdqa .Lone(%rip),%xmm4 -#movdqa %xmm0,4*0(%rsp) + movdqa %xmm1,16(%rsp) movdqa %xmm2,32(%rsp) movdqa %xmm3,48(%rsp) @@ -878,9 +878,9 @@ ChaCha20_4x: cmpq $64,%rdx jae .L64_or_more4x -#movdqa 0x00(%rsp),%xmm6 + xorq %r10,%r10 -#movdqa %xmm6,0x00(%rsp) + movdqa %xmm12,16(%rsp) movdqa %xmm4,32(%rsp) movdqa %xmm0,48(%rsp) @@ -1034,7 +1034,7 @@ ChaCha20_8x: andq $-32,%rsp vzeroupper -############### + diff --git a/third_party/boringssl/linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S b/third_party/boringssl/linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S index 32c315140ca01b..3eb1688c439cc3 100644 --- a/third_party/boringssl/linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S +++ b/third_party/boringssl/linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S @@ -213,7 +213,7 @@ aesgcmsiv_htable_polyval: vpclmulqdq $0x10,(%rdi),%xmm0,%xmm6 vpxor %xmm6,%xmm5,%xmm5 -######################################################### + vmovdqu 96(%rsi),%xmm0 vpclmulqdq $0x01,16(%rdi),%xmm0,%xmm6 vpxor %xmm6,%xmm5,%xmm5 @@ -225,7 +225,7 @@ aesgcmsiv_htable_polyval: vpxor %xmm6,%xmm5,%xmm5 -######################################################### + vmovdqu 80(%rsi),%xmm0 vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm7 @@ -242,7 +242,7 @@ aesgcmsiv_htable_polyval: vpxor %xmm7,%xmm1,%xmm1 -######################################################### + vmovdqu 64(%rsi),%xmm0 vpclmulqdq $0x01,48(%rdi),%xmm0,%xmm6 @@ -254,7 +254,7 @@ aesgcmsiv_htable_polyval: vpclmulqdq $0x10,48(%rdi),%xmm0,%xmm6 vpxor %xmm6,%xmm5,%xmm5 -######################################################### + vmovdqu 48(%rsi),%xmm0 vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm7 @@ -271,7 +271,7 @@ aesgcmsiv_htable_polyval: vpxor %xmm7,%xmm1,%xmm1 -######################################################### + vmovdqu 32(%rsi),%xmm0 vpclmulqdq $0x01,80(%rdi),%xmm0,%xmm6 @@ -285,7 +285,7 @@ aesgcmsiv_htable_polyval: vpxor %xmm9,%xmm1,%xmm1 -######################################################### + vmovdqu 16(%rsi),%xmm0 vpclmulqdq 
$0x01,96(%rdi),%xmm0,%xmm6 @@ -297,7 +297,7 @@ aesgcmsiv_htable_polyval: vpclmulqdq $0x10,96(%rdi),%xmm0,%xmm6 vpxor %xmm6,%xmm5,%xmm5 -######################################################### + vmovdqu 0(%rsi),%xmm0 vpxor %xmm1,%xmm0,%xmm0 @@ -310,7 +310,7 @@ aesgcmsiv_htable_polyval: vpclmulqdq $0x10,112(%rdi),%xmm0,%xmm6 vpxor %xmm6,%xmm5,%xmm5 -######################################################### + vpsrldq $8,%xmm5,%xmm6 vpslldq $8,%xmm5,%xmm5 @@ -320,7 +320,7 @@ aesgcmsiv_htable_polyval: leaq 128(%rsi),%rsi jmp .Lhtable_polyval_main_loop -######################################################### + .Lhtable_polyval_out: vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm6 @@ -488,10 +488,10 @@ aes128gcmsiv_aes_ks_enc_x1: vmovdqa con1(%rip),%xmm0 vmovdqa mask(%rip),%xmm15 - vpshufb %xmm15,%xmm1,%xmm2#!!saving mov instruction to xmm2 + vpshufb %xmm15,%xmm1,%xmm2 vaesenclast %xmm0,%xmm2,%xmm2 vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3#!!saving mov instruction to xmm3 + vpsllq $32,%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 vpshufb con3(%rip),%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 @@ -500,10 +500,10 @@ aes128gcmsiv_aes_ks_enc_x1: vaesenc %xmm1,%xmm4,%xmm4 vmovdqa %xmm1,16(%rdx) - vpshufb %xmm15,%xmm1,%xmm2#!!saving mov instruction to xmm2 + vpshufb %xmm15,%xmm1,%xmm2 vaesenclast %xmm0,%xmm2,%xmm2 vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3#!!saving mov instruction to xmm3 + vpsllq $32,%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 vpshufb con3(%rip),%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 @@ -512,10 +512,10 @@ aes128gcmsiv_aes_ks_enc_x1: vaesenc %xmm1,%xmm4,%xmm4 vmovdqa %xmm1,32(%rdx) - vpshufb %xmm15,%xmm1,%xmm2#!!saving mov instruction to xmm2 + vpshufb %xmm15,%xmm1,%xmm2 vaesenclast %xmm0,%xmm2,%xmm2 vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3#!!saving mov instruction to xmm3 + vpsllq $32,%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 vpshufb con3(%rip),%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 @@ -524,10 +524,10 @@ aes128gcmsiv_aes_ks_enc_x1: vaesenc %xmm1,%xmm4,%xmm4 vmovdqa %xmm1,48(%rdx) - vpshufb %xmm15,%xmm1,%xmm2#!!saving mov instruction to xmm2 + vpshufb %xmm15,%xmm1,%xmm2 vaesenclast %xmm0,%xmm2,%xmm2 vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3#!!saving mov instruction to xmm3 + vpsllq $32,%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 vpshufb con3(%rip),%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 @@ -536,10 +536,10 @@ aes128gcmsiv_aes_ks_enc_x1: vaesenc %xmm1,%xmm4,%xmm4 vmovdqa %xmm1,64(%rdx) - vpshufb %xmm15,%xmm1,%xmm2#!!saving mov instruction to xmm2 + vpshufb %xmm15,%xmm1,%xmm2 vaesenclast %xmm0,%xmm2,%xmm2 vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3#!!saving mov instruction to xmm3 + vpsllq $32,%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 vpshufb con3(%rip),%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 @@ -548,10 +548,10 @@ aes128gcmsiv_aes_ks_enc_x1: vaesenc %xmm1,%xmm4,%xmm4 vmovdqa %xmm1,80(%rdx) - vpshufb %xmm15,%xmm1,%xmm2#!!saving mov instruction to xmm2 + vpshufb %xmm15,%xmm1,%xmm2 vaesenclast %xmm0,%xmm2,%xmm2 vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3#!!saving mov instruction to xmm3 + vpsllq $32,%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 vpshufb con3(%rip),%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 @@ -560,10 +560,10 @@ aes128gcmsiv_aes_ks_enc_x1: vaesenc %xmm1,%xmm4,%xmm4 vmovdqa %xmm1,96(%rdx) - vpshufb %xmm15,%xmm1,%xmm2#!!saving mov instruction to xmm2 + vpshufb %xmm15,%xmm1,%xmm2 vaesenclast %xmm0,%xmm2,%xmm2 vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3#!!saving mov instruction to xmm3 + vpsllq $32,%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 vpshufb con3(%rip),%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 @@ -572,10 +572,10 @@ 
aes128gcmsiv_aes_ks_enc_x1: vaesenc %xmm1,%xmm4,%xmm4 vmovdqa %xmm1,112(%rdx) - vpshufb %xmm15,%xmm1,%xmm2#!!saving mov instruction to xmm2 + vpshufb %xmm15,%xmm1,%xmm2 vaesenclast %xmm0,%xmm2,%xmm2 vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3#!!saving mov instruction to xmm3 + vpsllq $32,%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 vpshufb con3(%rip),%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 @@ -587,10 +587,10 @@ aes128gcmsiv_aes_ks_enc_x1: vmovdqa con2(%rip),%xmm0 - vpshufb %xmm15,%xmm1,%xmm2#!!saving mov instruction to xmm2 + vpshufb %xmm15,%xmm1,%xmm2 vaesenclast %xmm0,%xmm2,%xmm2 vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3#!!saving mov instruction to xmm3 + vpsllq $32,%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 vpshufb con3(%rip),%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 @@ -599,9 +599,9 @@ aes128gcmsiv_aes_ks_enc_x1: vaesenc %xmm1,%xmm4,%xmm4 vmovdqa %xmm1,144(%rdx) - vpshufb %xmm15,%xmm1,%xmm2#!!saving mov instruction to xmm2 + vpshufb %xmm15,%xmm1,%xmm2 vaesenclast %xmm0,%xmm2,%xmm2 - vpsllq $32,%xmm1,%xmm3#!!saving mov instruction to xmm3 + vpsllq $32,%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 vpshufb con3(%rip),%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 @@ -733,7 +733,7 @@ aes128gcmsiv_enc_msg_x4: vmovdqa (%rdx),%xmm15 - vpor OR_MASK(%rip),%xmm15,%xmm15#IV = [1]TAG[126...32][00..00] + vpor OR_MASK(%rip),%xmm15,%xmm15 vmovdqu four(%rip),%xmm4 vmovdqa %xmm15,%xmm0 diff --git a/third_party/boringssl/linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S b/third_party/boringssl/linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S index ef4462d025cc90..677335b9633618 100644 --- a/third_party/boringssl/linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S +++ b/third_party/boringssl/linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S @@ -1343,7 +1343,7 @@ open_sse_main_loop: leaq 128(%rdi),%rdi jmp open_sse_tail_64_dec_loop 3: -############################################################################### + movdqa .chacha20_consts(%rip),%xmm0 movdqa 48(%rbp),%xmm4 movdqa 64(%rbp),%xmm8 @@ -1698,7 +1698,7 @@ open_sse_main_loop: subq $192,%rbx leaq 192(%rsi),%rsi leaq 192(%rdi),%rdi -############################################################################### + open_sse_tail_64_dec_loop: cmpq $16,%rbx @@ -1868,7 +1868,7 @@ open_sse_finalize: .cfi_adjust_cfa_offset -8 .byte 0xf3,0xc3 .cfi_adjust_cfa_offset (8 * 6) + 288 + 32 -############################################################################### + open_sse_128: movdqu .chacha20_consts(%rip),%xmm0 movdqa %xmm0,%xmm1 @@ -2102,8 +2102,8 @@ open_sse_128: .size chacha20_poly1305_open, .-chacha20_poly1305_open .cfi_endproc -################################################################################ -################################################################################ + + .globl chacha20_poly1305_seal .hidden chacha20_poly1305_seal @@ -2839,7 +2839,7 @@ chacha20_poly1305_seal: movq $6,%rcx cmpq $64,%rbx jg 3f -############################################################################### + seal_sse_tail_64: movdqa .chacha20_consts(%rip),%xmm0 movdqa 48(%rbp),%xmm4 @@ -2991,7 +2991,7 @@ seal_sse_tail_64: 3: cmpq $128,%rbx jg 3f -############################################################################### + seal_sse_tail_128: movdqa .chacha20_consts(%rip),%xmm0 movdqa 48(%rbp),%xmm4 @@ -3208,7 +3208,7 @@ seal_sse_tail_128: leaq 64(%rsi),%rsi jmp seal_sse_128_seal_hash 3: -############################################################################### + seal_sse_tail_192: movdqa .chacha20_consts(%rip),%xmm0 movdqa 
48(%rbp),%xmm4 @@ -3487,7 +3487,7 @@ seal_sse_tail_192: movq $128,%rcx subq $128,%rbx leaq 128(%rsi),%rsi -############################################################################### + seal_sse_128_seal_hash: cmpq $16,%rcx jb seal_sse_128_seal @@ -3632,7 +3632,7 @@ seal_sse_tail_16: -# + movq 288+32(%rsp),%r9 @@ -3936,7 +3936,7 @@ do_length_block: .cfi_adjust_cfa_offset -8 .byte 0xf3,0xc3 .cfi_adjust_cfa_offset (8 * 6) + 288 + 32 -################################################################################ + seal_sse_128: movdqu .chacha20_consts(%rip),%xmm0 movdqa %xmm0,%xmm1 @@ -4107,7 +4107,7 @@ seal_sse_128: jmp seal_sse_128_seal .size chacha20_poly1305_seal, .-chacha20_poly1305_seal -############################################################################### + .type chacha20_poly1305_open_avx2,@function .align 64 chacha20_poly1305_open_avx2: @@ -5819,7 +5819,7 @@ open_avx2_tail: vmovdqa %xmm0,%xmm1 jb 1f subq $16,%rbx -#load for decryption + vpxor (%rsi),%xmm0,%xmm1 vmovdqu %xmm1,(%rdi) leaq 16(%rsi),%rsi @@ -5829,7 +5829,7 @@ open_avx2_tail: 1: vzeroupper jmp open_sse_tail_16 -############################################################################### + open_avx2_192: vmovdqa %ymm0,%ymm1 vmovdqa %ymm0,%ymm2 @@ -6101,7 +6101,7 @@ open_avx2_short_tail_32: 1: vzeroupper jmp open_sse_tail_16 -############################################################################### + open_avx2_320: vmovdqa %ymm0,%ymm1 vmovdqa %ymm0,%ymm2 @@ -6264,8 +6264,8 @@ open_avx2_320: vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 jmp open_avx2_short .size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2 -############################################################################### -############################################################################### + + .type chacha20_poly1305_seal_avx2,@function .align 64 chacha20_poly1305_seal_avx2: @@ -7335,7 +7335,7 @@ chacha20_poly1305_seal_avx2: xorq %r8,%r8 cmpq $128,%rbx ja 3f -############################################################################### + seal_avx2_tail_128: vmovdqa .chacha20_consts(%rip),%ymm0 vmovdqa 64(%rbp),%ymm4 @@ -7529,7 +7529,7 @@ seal_avx2_tail_128: 3: cmpq $256,%rbx ja 3f -############################################################################### + seal_avx2_tail_256: vmovdqa .chacha20_consts(%rip),%ymm0 vmovdqa 64(%rbp),%ymm4 @@ -7785,7 +7785,7 @@ seal_avx2_tail_256: 3: cmpq $384,%rbx ja seal_avx2_tail_512 -############################################################################### + seal_avx2_tail_384: vmovdqa .chacha20_consts(%rip),%ymm0 vmovdqa 64(%rbp),%ymm4 @@ -8097,7 +8097,7 @@ seal_avx2_tail_384: leaq 256(%rsi),%rsi subq $256,%rbx jmp seal_avx2_hash -############################################################################### + seal_avx2_tail_512: vmovdqa .chacha20_consts(%rip),%ymm0 vmovdqa 64(%rbp),%ymm4 @@ -8499,7 +8499,7 @@ seal_avx2_tail_512: leaq 384(%rsi),%rsi subq $384,%rbx jmp seal_avx2_hash -################################################################################ + seal_avx2_320: vmovdqa %ymm0,%ymm1 vmovdqa %ymm0,%ymm2 @@ -8661,7 +8661,7 @@ seal_avx2_320: vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 jmp seal_avx2_short -################################################################################ + seal_avx2_192: vmovdqa %ymm0,%ymm1 vmovdqa %ymm0,%ymm2 diff --git a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aes-x86_64.S b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aes-x86_64.S index 53875e30ba3cee..f45e010e281063 100644 --- 
a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aes-x86_64.S +++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aes-x86_64.S @@ -91,8 +91,8 @@ _x86_64_AES_encrypt: movzbl %bh,%edi movzbl %ch,%ebp movzbl 2(%r14,%rsi,8),%r8d - movl 0(%r14,%rdi,8),%edi#%r10d - movl 0(%r14,%rbp,8),%ebp#%r11d + movl 0(%r14,%rdi,8),%edi + movl 0(%r14,%rbp,8),%ebp andl $0x0000ff00,%edi andl $0x0000ff00,%ebp @@ -104,8 +104,8 @@ _x86_64_AES_encrypt: movzbl %dh,%esi movzbl %ah,%edi shrl $16,%edx - movl 0(%r14,%rsi,8),%esi#%r12d - movl 0(%r14,%rdi,8),%edi#%r8d + movl 0(%r14,%rsi,8),%esi + movl 0(%r14,%rdi,8),%edi andl $0x0000ff00,%esi andl $0x0000ff00,%edi @@ -117,9 +117,9 @@ _x86_64_AES_encrypt: movzbl %cl,%esi movzbl %dl,%edi movzbl %al,%ebp - movl 0(%r14,%rsi,8),%esi#%r10d - movl 0(%r14,%rdi,8),%edi#%r11d - movl 0(%r14,%rbp,8),%ebp#%r12d + movl 0(%r14,%rsi,8),%esi + movl 0(%r14,%rdi,8),%edi + movl 0(%r14,%rbp,8),%ebp andl $0x00ff0000,%esi andl $0x00ff0000,%edi @@ -132,9 +132,9 @@ _x86_64_AES_encrypt: movzbl %bl,%esi movzbl %dh,%edi movzbl %ah,%ebp - movl 0(%r14,%rsi,8),%esi#%r8d - movl 2(%r14,%rdi,8),%edi#%r10d - movl 2(%r14,%rbp,8),%ebp#%r11d + movl 0(%r14,%rsi,8),%esi + movl 2(%r14,%rdi,8),%edi + movl 2(%r14,%rbp,8),%ebp andl $0x00ff0000,%esi andl $0xff000000,%edi @@ -147,8 +147,8 @@ _x86_64_AES_encrypt: movzbl %bh,%esi movzbl %ch,%edi movl 16+12(%r15),%edx - movl 2(%r14,%rsi,8),%esi#%r12d - movl 2(%r14,%rdi,8),%edi#%r8d + movl 2(%r14,%rsi,8),%esi + movl 2(%r14,%rdi,8),%edi movl 16+0(%r15),%eax andl $0xff000000,%esi @@ -199,12 +199,12 @@ _x86_64_AES_encrypt_compact: movzbl (%r14,%r12,1),%r12d movzbl (%r14,%r8,1),%r8d - movzbl (%r14,%rsi,1),%r9d#%r10d + movzbl (%r14,%rsi,1),%r9d movzbl %ah,%esi - movzbl (%r14,%rdi,1),%r13d#%r11d + movzbl (%r14,%rdi,1),%r13d movzbl %cl,%edi - movzbl (%r14,%rbp,1),%ebp#%r12d - movzbl (%r14,%rsi,1),%esi#%r8d + movzbl (%r14,%rbp,1),%ebp + movzbl (%r14,%rsi,1),%esi shll $8,%r9d shrl $16,%edx @@ -216,16 +216,16 @@ _x86_64_AES_encrypt_compact: xorl %r13d,%r11d shll $8,%ebp movzbl %al,%r13d - movzbl (%r14,%rdi,1),%edi#%r10d + movzbl (%r14,%rdi,1),%edi xorl %ebp,%r12d shll $8,%esi movzbl %bl,%ebp shll $16,%edi xorl %esi,%r8d - movzbl (%r14,%r9,1),%r9d#%r11d + movzbl (%r14,%r9,1),%r9d movzbl %dh,%esi - movzbl (%r14,%r13,1),%r13d#%r12d + movzbl (%r14,%r13,1),%r13d xorl %edi,%r10d shrl $8,%ecx @@ -234,11 +234,11 @@ _x86_64_AES_encrypt_compact: shrl $8,%ebx shll $16,%r13d xorl %r9d,%r11d - movzbl (%r14,%rbp,1),%ebp#%r8d - movzbl (%r14,%rsi,1),%esi#%r10d - movzbl (%r14,%rdi,1),%edi#%r11d - movzbl (%r14,%rcx,1),%edx#%r8d - movzbl (%r14,%rbx,1),%ecx#%r12d + movzbl (%r14,%rbp,1),%ebp + movzbl (%r14,%rsi,1),%esi + movzbl (%r14,%rdi,1),%edi + movzbl (%r14,%rcx,1),%edx + movzbl (%r14,%rbx,1),%ecx shll $16,%ebp xorl %r13d,%r12d @@ -504,8 +504,8 @@ _x86_64_AES_decrypt: movzbl %dh,%edi movzbl %ah,%ebp movzbl (%r14,%rsi,1),%r8d - movzbl (%r14,%rdi,1),%edi#%r10d - movzbl (%r14,%rbp,1),%ebp#%r11d + movzbl (%r14,%rdi,1),%edi + movzbl (%r14,%rbp,1),%ebp shll $8,%edi shll $8,%ebp @@ -517,8 +517,8 @@ _x86_64_AES_decrypt: movzbl %bh,%esi movzbl %ch,%edi shrl $16,%eax - movzbl (%r14,%rsi,1),%esi#%r12d - movzbl (%r14,%rdi,1),%edi#%r8d + movzbl (%r14,%rsi,1),%esi + movzbl (%r14,%rdi,1),%edi shll $8,%esi shll $8,%edi @@ -530,9 +530,9 @@ _x86_64_AES_decrypt: movzbl %cl,%esi movzbl %dl,%edi movzbl %al,%ebp - movzbl (%r14,%rsi,1),%esi#%r10d - movzbl (%r14,%rdi,1),%edi#%r11d - movzbl (%r14,%rbp,1),%ebp#%r12d + movzbl (%r14,%rsi,1),%esi + movzbl (%r14,%rdi,1),%edi + movzbl (%r14,%rbp,1),%ebp shll 
$16,%esi shll $16,%edi @@ -545,9 +545,9 @@ _x86_64_AES_decrypt: movzbl %bl,%esi movzbl %bh,%edi movzbl %ch,%ebp - movzbl (%r14,%rsi,1),%esi#%r8d - movzbl (%r14,%rdi,1),%edi#%r10d - movzbl (%r14,%rbp,1),%ebp#%r11d + movzbl (%r14,%rsi,1),%esi + movzbl (%r14,%rdi,1),%edi + movzbl (%r14,%rbp,1),%ebp shll $16,%esi shll $24,%edi @@ -560,8 +560,8 @@ _x86_64_AES_decrypt: movzbl %dh,%esi movzbl %ah,%edi movl 16+12(%r15),%edx - movzbl (%r14,%rsi,1),%esi#%r12d - movzbl (%r14,%rdi,1),%edi#%r8d + movzbl (%r14,%rsi,1),%esi + movzbl (%r14,%rdi,1),%edi movl 16+0(%r15),%eax shll $24,%esi @@ -614,11 +614,11 @@ _x86_64_AES_decrypt_compact: movzbl (%r14,%r12,1),%r12d movzbl (%r14,%r8,1),%r8d - movzbl (%r14,%rsi,1),%r9d#%r10d + movzbl (%r14,%rsi,1),%r9d movzbl %ch,%esi - movzbl (%r14,%rdi,1),%r13d#%r11d - movzbl (%r14,%rbp,1),%ebp#%r12d - movzbl (%r14,%rsi,1),%esi#%r8d + movzbl (%r14,%rdi,1),%r13d + movzbl (%r14,%rbp,1),%ebp + movzbl (%r14,%rsi,1),%esi shrl $16,%ecx shll $8,%r13d @@ -633,17 +633,17 @@ _x86_64_AES_decrypt_compact: xorl %r13d,%r11d shll $8,%esi movzbl %al,%r13d - movzbl (%r14,%rdi,1),%edi#%r10d + movzbl (%r14,%rdi,1),%edi xorl %ebp,%r12d movzbl %bl,%ebp shll $16,%edi xorl %esi,%r8d - movzbl (%r14,%r9,1),%r9d#%r11d + movzbl (%r14,%r9,1),%r9d movzbl %bh,%esi - movzbl (%r14,%rbp,1),%ebp#%r8d + movzbl (%r14,%rbp,1),%ebp xorl %edi,%r10d - movzbl (%r14,%r13,1),%r13d#%r12d + movzbl (%r14,%r13,1),%r13d movzbl %ch,%edi shll $16,%ebp @@ -655,10 +655,10 @@ _x86_64_AES_decrypt_compact: shrl $8,%eax xorl %r13d,%r12d - movzbl (%r14,%rsi,1),%esi#%r10d - movzbl (%r14,%rdi,1),%ebx#%r11d - movzbl (%r14,%rbp,1),%ecx#%r12d - movzbl (%r14,%rax,1),%edx#%r8d + movzbl (%r14,%rsi,1),%esi + movzbl (%r14,%rdi,1),%ebx + movzbl (%r14,%rbp,1),%ecx + movzbl (%r14,%rax,1),%edx movl %r10d,%eax shll $24,%esi @@ -1464,7 +1464,7 @@ aes_nohw_cbc_encrypt: xchgq %rsp,%r15 .cfi_def_cfa_register %r15 -#add $8,%rsp + movq %r15,16(%rsp) .cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x40 .Lcbc_fast_body: @@ -1515,7 +1515,7 @@ aes_nohw_cbc_encrypt: cmpq $0,%rbx je .LFAST_DECRYPT -#----------------------------- ENCRYPT -----------------------------# + movl 0(%rbp),%eax movl 4(%rbp),%ebx movl 8(%rbp),%ecx @@ -1553,7 +1553,7 @@ aes_nohw_cbc_encrypt: jmp .Lcbc_fast_cleanup -#----------------------------- DECRYPT -----------------------------# + .align 16 .LFAST_DECRYPT: cmpq %r8,%r9 @@ -1661,7 +1661,7 @@ aes_nohw_cbc_encrypt: jmp .Lcbc_exit -#--------------------------- SLOW ROUTINE ---------------------------# + .align 16 .Lcbc_slow_prologue: .cfi_restore_state @@ -1677,14 +1677,14 @@ aes_nohw_cbc_encrypt: xchgq %rsp,%rbp .cfi_def_cfa_register %rbp -#add $8,%rsp + movq %rbp,16(%rsp) .cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x40 .Lcbc_slow_body: -#mov %rdi,24(%rsp) -#mov %rsi,32(%rsp) -#mov %rdx,40(%rsp) -#mov %rcx,48(%rsp) + + + + movq %r8,56(%rsp) movq %r8,%rbp movq %r9,%rbx @@ -1709,7 +1709,7 @@ aes_nohw_cbc_encrypt: cmpq $0,%rbx je .LSLOW_DECRYPT -#--------------------------- SLOW ENCRYPT ---------------------------# + testq $-16,%r10 movl 0(%rbp),%eax movl 4(%rbp),%ebx @@ -1770,7 +1770,7 @@ aes_nohw_cbc_encrypt: movq %r11,%rax movq %r12,%rcx jmp .Lcbc_slow_enc_loop -#--------------------------- SLOW DECRYPT ---------------------------# + .align 16 .LSLOW_DECRYPT: shrq $3,%rax diff --git a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S index 038a9d57a0d322..240cb5d4730f30 100644 --- 
a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S +++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S @@ -57,7 +57,7 @@ _aesni_ctr32_ghash_6x: -# + diff --git a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-x86_64.S b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-x86_64.S index 0db3f55004b3c4..42e55307ff2fc1 100644 --- a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-x86_64.S +++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-x86_64.S @@ -551,7 +551,7 @@ aes_hw_ecb_encrypt: movl %eax,%r10d testl %r8d,%r8d jz .Lecb_decrypt -#--------------------------- ECB ENCRYPT ------------------------------# + cmpq $0x80,%rdx jb .Lecb_enc_tail @@ -692,7 +692,7 @@ aes_hw_ecb_encrypt: movups %xmm6,64(%rsi) movups %xmm7,80(%rsi) jmp .Lecb_ret -#--------------------------- ECB DECRYPT ------------------------------# + .align 16 .Lecb_decrypt: cmpq $0x80,%rdx @@ -881,168 +881,6 @@ aes_hw_ecb_encrypt: .byte 0xf3,0xc3 .cfi_endproc .size aes_hw_ecb_encrypt,.-aes_hw_ecb_encrypt -.globl aes_hw_ccm64_encrypt_blocks -.hidden aes_hw_ccm64_encrypt_blocks -.type aes_hw_ccm64_encrypt_blocks,@function -.align 16 -aes_hw_ccm64_encrypt_blocks: - movl 240(%rcx),%eax - movdqu (%r8),%xmm6 - movdqa .Lincrement64(%rip),%xmm9 - movdqa .Lbswap_mask(%rip),%xmm7 - - shll $4,%eax - movl $16,%r10d - leaq 0(%rcx),%r11 - movdqu (%r9),%xmm3 - movdqa %xmm6,%xmm2 - leaq 32(%rcx,%rax,1),%rcx -.byte 102,15,56,0,247 - subq %rax,%r10 - jmp .Lccm64_enc_outer -.align 16 -.Lccm64_enc_outer: - movups (%r11),%xmm0 - movq %r10,%rax - movups (%rdi),%xmm8 - - xorps %xmm0,%xmm2 - movups 16(%r11),%xmm1 - xorps %xmm8,%xmm0 - xorps %xmm0,%xmm3 - movups 32(%r11),%xmm0 - -.Lccm64_enc2_loop: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Lccm64_enc2_loop -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - paddq %xmm9,%xmm6 - decq %rdx -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 - - leaq 16(%rdi),%rdi - xorps %xmm2,%xmm8 - movdqa %xmm6,%xmm2 - movups %xmm8,(%rsi) -.byte 102,15,56,0,215 - leaq 16(%rsi),%rsi - jnz .Lccm64_enc_outer - - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - movups %xmm3,(%r9) - pxor %xmm3,%xmm3 - pxor %xmm8,%xmm8 - pxor %xmm6,%xmm6 - .byte 0xf3,0xc3 -.size aes_hw_ccm64_encrypt_blocks,.-aes_hw_ccm64_encrypt_blocks -.globl aes_hw_ccm64_decrypt_blocks -.hidden aes_hw_ccm64_decrypt_blocks -.type aes_hw_ccm64_decrypt_blocks,@function -.align 16 -aes_hw_ccm64_decrypt_blocks: - movl 240(%rcx),%eax - movups (%r8),%xmm6 - movdqu (%r9),%xmm3 - movdqa .Lincrement64(%rip),%xmm9 - movdqa .Lbswap_mask(%rip),%xmm7 - - movaps %xmm6,%xmm2 - movl %eax,%r10d - movq %rcx,%r11 -.byte 102,15,56,0,247 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -.Loop_enc1_5: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_enc1_5 -.byte 102,15,56,221,209 - shll $4,%r10d - movl $16,%eax - movups (%rdi),%xmm8 - paddq %xmm9,%xmm6 - leaq 16(%rdi),%rdi - subq %r10,%rax - leaq 32(%r11,%r10,1),%rcx - movq %rax,%r10 - jmp .Lccm64_dec_outer -.align 16 -.Lccm64_dec_outer: - xorps %xmm2,%xmm8 - movdqa %xmm6,%xmm2 - movups %xmm8,(%rsi) - leaq 16(%rsi),%rsi -.byte 102,15,56,0,215 - - subq $1,%rdx - jz .Lccm64_dec_break - - movups (%r11),%xmm0 - movq %r10,%rax - movups 16(%r11),%xmm1 - xorps %xmm0,%xmm8 - xorps %xmm0,%xmm2 - xorps %xmm8,%xmm3 - movups 
32(%r11),%xmm0 - jmp .Lccm64_dec2_loop -.align 16 -.Lccm64_dec2_loop: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Lccm64_dec2_loop - movups (%rdi),%xmm8 - paddq %xmm9,%xmm6 -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 - leaq 16(%rdi),%rdi - jmp .Lccm64_dec_outer - -.align 16 -.Lccm64_dec_break: -#xorps %xmm8,%xmm3 - movl 240(%r11),%eax - movups (%r11),%xmm0 - movups 16(%r11),%xmm1 - xorps %xmm0,%xmm8 - leaq 32(%r11),%r11 - xorps %xmm8,%xmm3 -.Loop_enc1_6: -.byte 102,15,56,220,217 - decl %eax - movups (%r11),%xmm1 - leaq 16(%r11),%r11 - jnz .Loop_enc1_6 -.byte 102,15,56,221,217 - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - movups %xmm3,(%r9) - pxor %xmm3,%xmm3 - pxor %xmm8,%xmm8 - pxor %xmm6,%xmm6 - .byte 0xf3,0xc3 -.size aes_hw_ccm64_decrypt_blocks,.-aes_hw_ccm64_decrypt_blocks .globl aes_hw_ctr32_encrypt_blocks .hidden aes_hw_ctr32_encrypt_blocks .type aes_hw_ctr32_encrypt_blocks,@function @@ -1066,12 +904,12 @@ aes_hw_ctr32_encrypt_blocks: movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 -.Loop_enc1_7: +.Loop_enc1_5: .byte 102,15,56,220,209 decl %edx movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_enc1_7 + jnz .Loop_enc1_5 .byte 102,15,56,221,209 pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 @@ -1456,7 +1294,7 @@ aes_hw_ctr32_encrypt_blocks: movdqa 64(%rsp),%xmm15 .byte 102,68,15,56,221,193 movdqa 80(%rsp),%xmm0 - movups 16-128(%rcx),%xmm1#real 1st-round key + movups 16-128(%rcx),%xmm1 .byte 102,69,15,56,221,202 movups %xmm2,(%rsi) @@ -1628,1906 +1466,73 @@ aes_hw_ctr32_encrypt_blocks: .byte 0xf3,0xc3 .cfi_endproc .size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks -.globl aes_hw_xts_encrypt -.hidden aes_hw_xts_encrypt -.type aes_hw_xts_encrypt,@function +.globl aes_hw_cbc_encrypt +.hidden aes_hw_cbc_encrypt +.type aes_hw_cbc_encrypt,@function .align 16 -aes_hw_xts_encrypt: +aes_hw_cbc_encrypt: .cfi_startproc - leaq (%rsp),%r11 -.cfi_def_cfa_register %r11 - pushq %rbp -.cfi_offset %rbp,-16 - subq $112,%rsp - andq $-16,%rsp - movups (%r9),%xmm2 - movl 240(%r8),%eax + testq %rdx,%rdx + jz .Lcbc_ret + movl 240(%rcx),%r10d - movups (%r8),%xmm0 - movups 16(%r8),%xmm1 - leaq 32(%r8),%r8 - xorps %xmm0,%xmm2 -.Loop_enc1_8: + movq %rcx,%r11 + testl %r9d,%r9d + jz .Lcbc_decrypt + + movups (%r8),%xmm2 + movl %r10d,%eax + cmpq $16,%rdx + jb .Lcbc_enc_tail + subq $16,%rdx + jmp .Lcbc_enc_loop +.align 16 +.Lcbc_enc_loop: + movups (%rdi),%xmm3 + leaq 16(%rdi),%rdi + + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm3 + leaq 32(%rcx),%rcx + xorps %xmm3,%xmm2 +.Loop_enc1_6: .byte 102,15,56,220,209 decl %eax - movups (%r8),%xmm1 - leaq 16(%r8),%r8 - jnz .Loop_enc1_8 + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_enc1_6 .byte 102,15,56,221,209 - movups (%rcx),%xmm0 - movq %rcx,%rbp movl %r10d,%eax - shll $4,%r10d - movq %rdx,%r9 - andq $-16,%rdx - - movups 16(%rcx,%r10,1),%xmm1 - - movdqa .Lxts_magic(%rip),%xmm8 - movdqa %xmm2,%xmm15 - pshufd $0x5f,%xmm2,%xmm9 - pxor %xmm0,%xmm1 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm10 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm10 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm11 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm11 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa 
%xmm15,%xmm12 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm12 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm13 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm13 - pxor %xmm14,%xmm15 - movdqa %xmm15,%xmm14 - psrad $31,%xmm9 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pxor %xmm0,%xmm14 - pxor %xmm9,%xmm15 - movaps %xmm1,96(%rsp) - - subq $96,%rdx - jc .Lxts_enc_short - - movl $16+96,%eax - leaq 32(%rbp,%r10,1),%rcx - subq %r10,%rax - movups 16(%rbp),%xmm1 - movq %rax,%r10 - leaq .Lxts_magic(%rip),%r8 - jmp .Lxts_enc_grandloop - -.align 32 -.Lxts_enc_grandloop: - movdqu 0(%rdi),%xmm2 - movdqa %xmm0,%xmm8 - movdqu 16(%rdi),%xmm3 - pxor %xmm10,%xmm2 - movdqu 32(%rdi),%xmm4 - pxor %xmm11,%xmm3 -.byte 102,15,56,220,209 - movdqu 48(%rdi),%xmm5 - pxor %xmm12,%xmm4 -.byte 102,15,56,220,217 - movdqu 64(%rdi),%xmm6 - pxor %xmm13,%xmm5 -.byte 102,15,56,220,225 - movdqu 80(%rdi),%xmm7 - pxor %xmm15,%xmm8 - movdqa 96(%rsp),%xmm9 - pxor %xmm14,%xmm6 -.byte 102,15,56,220,233 - movups 32(%rbp),%xmm0 - leaq 96(%rdi),%rdi - pxor %xmm8,%xmm7 - - pxor %xmm9,%xmm10 -.byte 102,15,56,220,241 - pxor %xmm9,%xmm11 - movdqa %xmm10,0(%rsp) -.byte 102,15,56,220,249 - movups 48(%rbp),%xmm1 - pxor %xmm9,%xmm12 - -.byte 102,15,56,220,208 - pxor %xmm9,%xmm13 - movdqa %xmm11,16(%rsp) -.byte 102,15,56,220,216 - pxor %xmm9,%xmm14 - movdqa %xmm12,32(%rsp) -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - pxor %xmm9,%xmm8 - movdqa %xmm14,64(%rsp) -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups 64(%rbp),%xmm0 - movdqa %xmm8,80(%rsp) - pshufd $0x5f,%xmm15,%xmm9 - jmp .Lxts_enc_loop6 -.align 32 -.Lxts_enc_loop6: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups -64(%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups -80(%rcx,%rax,1),%xmm0 - jnz .Lxts_enc_loop6 - - movdqa (%r8),%xmm8 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 -.byte 102,15,56,220,209 - paddq %xmm15,%xmm15 - psrad $31,%xmm14 -.byte 102,15,56,220,217 - pand %xmm8,%xmm14 - movups (%rbp),%xmm10 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 - pxor %xmm14,%xmm15 - movaps %xmm10,%xmm11 -.byte 102,15,56,220,249 - movups -64(%rcx),%xmm1 - - movdqa %xmm9,%xmm14 -.byte 102,15,56,220,208 - paddd %xmm9,%xmm9 - pxor %xmm15,%xmm10 -.byte 102,15,56,220,216 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - pand %xmm8,%xmm14 - movaps %xmm11,%xmm12 -.byte 102,15,56,220,240 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 -.byte 102,15,56,220,248 - movups -48(%rcx),%xmm0 - - paddd %xmm9,%xmm9 -.byte 102,15,56,220,209 - pxor %xmm15,%xmm11 - psrad $31,%xmm14 -.byte 102,15,56,220,217 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movdqa %xmm13,48(%rsp) - pxor %xmm14,%xmm15 -.byte 102,15,56,220,241 - movaps %xmm12,%xmm13 - movdqa %xmm9,%xmm14 -.byte 102,15,56,220,249 - movups -32(%rcx),%xmm1 - - paddd %xmm9,%xmm9 -.byte 102,15,56,220,208 - pxor %xmm15,%xmm12 - psrad $31,%xmm14 -.byte 102,15,56,220,216 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 - pxor %xmm14,%xmm15 - movaps %xmm13,%xmm14 -.byte 102,15,56,220,248 - - movdqa %xmm9,%xmm0 - paddd 
%xmm9,%xmm9 -.byte 102,15,56,220,209 - pxor %xmm15,%xmm13 - psrad $31,%xmm0 -.byte 102,15,56,220,217 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm0 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - pxor %xmm0,%xmm15 - movups (%rbp),%xmm0 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups 16(%rbp),%xmm1 - - pxor %xmm15,%xmm14 -.byte 102,15,56,221,84,36,0 - psrad $31,%xmm9 - paddq %xmm15,%xmm15 -.byte 102,15,56,221,92,36,16 -.byte 102,15,56,221,100,36,32 - pand %xmm8,%xmm9 - movq %r10,%rax -.byte 102,15,56,221,108,36,48 -.byte 102,15,56,221,116,36,64 -.byte 102,15,56,221,124,36,80 - pxor %xmm9,%xmm15 - - leaq 96(%rsi),%rsi - movups %xmm2,-96(%rsi) - movups %xmm3,-80(%rsi) - movups %xmm4,-64(%rsi) - movups %xmm5,-48(%rsi) - movups %xmm6,-32(%rsi) - movups %xmm7,-16(%rsi) - subq $96,%rdx - jnc .Lxts_enc_grandloop - - movl $16+96,%eax - subl %r10d,%eax - movq %rbp,%rcx - shrl $4,%eax + movq %r11,%rcx + movups %xmm2,0(%rsi) + leaq 16(%rsi),%rsi + subq $16,%rdx + jnc .Lcbc_enc_loop + addq $16,%rdx + jnz .Lcbc_enc_tail + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movups %xmm2,(%r8) + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + jmp .Lcbc_ret -.Lxts_enc_short: +.Lcbc_enc_tail: + movq %rdx,%rcx + xchgq %rdi,%rsi +.long 0x9066A4F3 + movl $16,%ecx + subq %rdx,%rcx + xorl %eax,%eax +.long 0x9066AAF3 + leaq -16(%rdi),%rdi + movl %r10d,%eax + movq %rdi,%rsi + movq %r11,%rcx + xorq %rdx,%rdx + jmp .Lcbc_enc_loop - movl %eax,%r10d - pxor %xmm0,%xmm10 - addq $96,%rdx - jz .Lxts_enc_done - - pxor %xmm0,%xmm11 - cmpq $0x20,%rdx - jb .Lxts_enc_one - pxor %xmm0,%xmm12 - je .Lxts_enc_two - - pxor %xmm0,%xmm13 - cmpq $0x40,%rdx - jb .Lxts_enc_three - pxor %xmm0,%xmm14 - je .Lxts_enc_four - - movdqu (%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqu 32(%rdi),%xmm4 - pxor %xmm10,%xmm2 - movdqu 48(%rdi),%xmm5 - pxor %xmm11,%xmm3 - movdqu 64(%rdi),%xmm6 - leaq 80(%rdi),%rdi - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm5 - pxor %xmm14,%xmm6 - pxor %xmm7,%xmm7 - - call _aesni_encrypt6 - - xorps %xmm10,%xmm2 - movdqa %xmm15,%xmm10 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - movdqu %xmm2,(%rsi) - xorps %xmm13,%xmm5 - movdqu %xmm3,16(%rsi) - xorps %xmm14,%xmm6 - movdqu %xmm4,32(%rsi) - movdqu %xmm5,48(%rsi) - movdqu %xmm6,64(%rsi) - leaq 80(%rsi),%rsi - jmp .Lxts_enc_done - -.align 16 -.Lxts_enc_one: - movups (%rdi),%xmm2 - leaq 16(%rdi),%rdi - xorps %xmm10,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -.Loop_enc1_9: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_enc1_9 -.byte 102,15,56,221,209 - xorps %xmm10,%xmm2 - movdqa %xmm11,%xmm10 - movups %xmm2,(%rsi) - leaq 16(%rsi),%rsi - jmp .Lxts_enc_done - -.align 16 -.Lxts_enc_two: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - leaq 32(%rdi),%rdi - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - - call _aesni_encrypt2 - - xorps %xmm10,%xmm2 - movdqa %xmm12,%xmm10 - xorps %xmm11,%xmm3 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - leaq 32(%rsi),%rsi - jmp .Lxts_enc_done - -.align 16 -.Lxts_enc_three: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 32(%rdi),%xmm4 - leaq 48(%rdi),%rdi - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - - call _aesni_encrypt3 - - xorps %xmm10,%xmm2 - movdqa %xmm13,%xmm10 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - leaq 48(%rsi),%rsi - jmp .Lxts_enc_done - -.align 16 -.Lxts_enc_four: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 32(%rdi),%xmm4 - xorps %xmm10,%xmm2 - movups 
48(%rdi),%xmm5 - leaq 64(%rdi),%rdi - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - xorps %xmm13,%xmm5 - - call _aesni_encrypt4 - - pxor %xmm10,%xmm2 - movdqa %xmm14,%xmm10 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm4 - movdqu %xmm2,(%rsi) - pxor %xmm13,%xmm5 - movdqu %xmm3,16(%rsi) - movdqu %xmm4,32(%rsi) - movdqu %xmm5,48(%rsi) - leaq 64(%rsi),%rsi - jmp .Lxts_enc_done - -.align 16 -.Lxts_enc_done: - andq $15,%r9 - jz .Lxts_enc_ret - movq %r9,%rdx - -.Lxts_enc_steal: - movzbl (%rdi),%eax - movzbl -16(%rsi),%ecx - leaq 1(%rdi),%rdi - movb %al,-16(%rsi) - movb %cl,0(%rsi) - leaq 1(%rsi),%rsi - subq $1,%rdx - jnz .Lxts_enc_steal - - subq %r9,%rsi - movq %rbp,%rcx - movl %r10d,%eax - - movups -16(%rsi),%xmm2 - xorps %xmm10,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -.Loop_enc1_10: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_enc1_10 -.byte 102,15,56,221,209 - xorps %xmm10,%xmm2 - movups %xmm2,-16(%rsi) - -.Lxts_enc_ret: - xorps %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - movaps %xmm0,0(%rsp) - pxor %xmm8,%xmm8 - movaps %xmm0,16(%rsp) - pxor %xmm9,%xmm9 - movaps %xmm0,32(%rsp) - pxor %xmm10,%xmm10 - movaps %xmm0,48(%rsp) - pxor %xmm11,%xmm11 - movaps %xmm0,64(%rsp) - pxor %xmm12,%xmm12 - movaps %xmm0,80(%rsp) - pxor %xmm13,%xmm13 - movaps %xmm0,96(%rsp) - pxor %xmm14,%xmm14 - pxor %xmm15,%xmm15 - movq -8(%r11),%rbp -.cfi_restore %rbp - leaq (%r11),%rsp -.cfi_def_cfa_register %rsp -.Lxts_enc_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size aes_hw_xts_encrypt,.-aes_hw_xts_encrypt -.globl aes_hw_xts_decrypt -.hidden aes_hw_xts_decrypt -.type aes_hw_xts_decrypt,@function -.align 16 -aes_hw_xts_decrypt: -.cfi_startproc - leaq (%rsp),%r11 -.cfi_def_cfa_register %r11 - pushq %rbp -.cfi_offset %rbp,-16 - subq $112,%rsp - andq $-16,%rsp - movups (%r9),%xmm2 - movl 240(%r8),%eax - movl 240(%rcx),%r10d - movups (%r8),%xmm0 - movups 16(%r8),%xmm1 - leaq 32(%r8),%r8 - xorps %xmm0,%xmm2 -.Loop_enc1_11: -.byte 102,15,56,220,209 - decl %eax - movups (%r8),%xmm1 - leaq 16(%r8),%r8 - jnz .Loop_enc1_11 -.byte 102,15,56,221,209 - xorl %eax,%eax - testq $15,%rdx - setnz %al - shlq $4,%rax - subq %rax,%rdx - - movups (%rcx),%xmm0 - movq %rcx,%rbp - movl %r10d,%eax - shll $4,%r10d - movq %rdx,%r9 - andq $-16,%rdx - - movups 16(%rcx,%r10,1),%xmm1 - - movdqa .Lxts_magic(%rip),%xmm8 - movdqa %xmm2,%xmm15 - pshufd $0x5f,%xmm2,%xmm9 - pxor %xmm0,%xmm1 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm10 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm10 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm11 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm11 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm12 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm12 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm13 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm13 - pxor %xmm14,%xmm15 - movdqa %xmm15,%xmm14 - psrad $31,%xmm9 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pxor %xmm0,%xmm14 - pxor %xmm9,%xmm15 - movaps %xmm1,96(%rsp) - - subq $96,%rdx - jc .Lxts_dec_short - - movl $16+96,%eax - leaq 32(%rbp,%r10,1),%rcx - subq %r10,%rax - movups 16(%rbp),%xmm1 - movq %rax,%r10 - leaq .Lxts_magic(%rip),%r8 - jmp 
.Lxts_dec_grandloop - -.align 32 -.Lxts_dec_grandloop: - movdqu 0(%rdi),%xmm2 - movdqa %xmm0,%xmm8 - movdqu 16(%rdi),%xmm3 - pxor %xmm10,%xmm2 - movdqu 32(%rdi),%xmm4 - pxor %xmm11,%xmm3 -.byte 102,15,56,222,209 - movdqu 48(%rdi),%xmm5 - pxor %xmm12,%xmm4 -.byte 102,15,56,222,217 - movdqu 64(%rdi),%xmm6 - pxor %xmm13,%xmm5 -.byte 102,15,56,222,225 - movdqu 80(%rdi),%xmm7 - pxor %xmm15,%xmm8 - movdqa 96(%rsp),%xmm9 - pxor %xmm14,%xmm6 -.byte 102,15,56,222,233 - movups 32(%rbp),%xmm0 - leaq 96(%rdi),%rdi - pxor %xmm8,%xmm7 - - pxor %xmm9,%xmm10 -.byte 102,15,56,222,241 - pxor %xmm9,%xmm11 - movdqa %xmm10,0(%rsp) -.byte 102,15,56,222,249 - movups 48(%rbp),%xmm1 - pxor %xmm9,%xmm12 - -.byte 102,15,56,222,208 - pxor %xmm9,%xmm13 - movdqa %xmm11,16(%rsp) -.byte 102,15,56,222,216 - pxor %xmm9,%xmm14 - movdqa %xmm12,32(%rsp) -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 - pxor %xmm9,%xmm8 - movdqa %xmm14,64(%rsp) -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 - movups 64(%rbp),%xmm0 - movdqa %xmm8,80(%rsp) - pshufd $0x5f,%xmm15,%xmm9 - jmp .Lxts_dec_loop6 -.align 32 -.Lxts_dec_loop6: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups -64(%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 - movups -80(%rcx,%rax,1),%xmm0 - jnz .Lxts_dec_loop6 - - movdqa (%r8),%xmm8 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 -.byte 102,15,56,222,209 - paddq %xmm15,%xmm15 - psrad $31,%xmm14 -.byte 102,15,56,222,217 - pand %xmm8,%xmm14 - movups (%rbp),%xmm10 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 - pxor %xmm14,%xmm15 - movaps %xmm10,%xmm11 -.byte 102,15,56,222,249 - movups -64(%rcx),%xmm1 - - movdqa %xmm9,%xmm14 -.byte 102,15,56,222,208 - paddd %xmm9,%xmm9 - pxor %xmm15,%xmm10 -.byte 102,15,56,222,216 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 - pand %xmm8,%xmm14 - movaps %xmm11,%xmm12 -.byte 102,15,56,222,240 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 -.byte 102,15,56,222,248 - movups -48(%rcx),%xmm0 - - paddd %xmm9,%xmm9 -.byte 102,15,56,222,209 - pxor %xmm15,%xmm11 - psrad $31,%xmm14 -.byte 102,15,56,222,217 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - movdqa %xmm13,48(%rsp) - pxor %xmm14,%xmm15 -.byte 102,15,56,222,241 - movaps %xmm12,%xmm13 - movdqa %xmm9,%xmm14 -.byte 102,15,56,222,249 - movups -32(%rcx),%xmm1 - - paddd %xmm9,%xmm9 -.byte 102,15,56,222,208 - pxor %xmm15,%xmm12 - psrad $31,%xmm14 -.byte 102,15,56,222,216 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 - pxor %xmm14,%xmm15 - movaps %xmm13,%xmm14 -.byte 102,15,56,222,248 - - movdqa %xmm9,%xmm0 - paddd %xmm9,%xmm9 -.byte 102,15,56,222,209 - pxor %xmm15,%xmm13 - psrad $31,%xmm0 -.byte 102,15,56,222,217 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm0 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - pxor %xmm0,%xmm15 - movups (%rbp),%xmm0 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups 16(%rbp),%xmm1 - - pxor %xmm15,%xmm14 -.byte 102,15,56,223,84,36,0 - psrad $31,%xmm9 - paddq %xmm15,%xmm15 -.byte 102,15,56,223,92,36,16 -.byte 102,15,56,223,100,36,32 - pand %xmm8,%xmm9 - movq %r10,%rax -.byte 102,15,56,223,108,36,48 -.byte 102,15,56,223,116,36,64 -.byte 102,15,56,223,124,36,80 - pxor %xmm9,%xmm15 - - 
leaq 96(%rsi),%rsi - movups %xmm2,-96(%rsi) - movups %xmm3,-80(%rsi) - movups %xmm4,-64(%rsi) - movups %xmm5,-48(%rsi) - movups %xmm6,-32(%rsi) - movups %xmm7,-16(%rsi) - subq $96,%rdx - jnc .Lxts_dec_grandloop - - movl $16+96,%eax - subl %r10d,%eax - movq %rbp,%rcx - shrl $4,%eax - -.Lxts_dec_short: - - movl %eax,%r10d - pxor %xmm0,%xmm10 - pxor %xmm0,%xmm11 - addq $96,%rdx - jz .Lxts_dec_done - - pxor %xmm0,%xmm12 - cmpq $0x20,%rdx - jb .Lxts_dec_one - pxor %xmm0,%xmm13 - je .Lxts_dec_two - - pxor %xmm0,%xmm14 - cmpq $0x40,%rdx - jb .Lxts_dec_three - je .Lxts_dec_four - - movdqu (%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqu 32(%rdi),%xmm4 - pxor %xmm10,%xmm2 - movdqu 48(%rdi),%xmm5 - pxor %xmm11,%xmm3 - movdqu 64(%rdi),%xmm6 - leaq 80(%rdi),%rdi - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm5 - pxor %xmm14,%xmm6 - - call _aesni_decrypt6 - - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - movdqu %xmm2,(%rsi) - xorps %xmm13,%xmm5 - movdqu %xmm3,16(%rsi) - xorps %xmm14,%xmm6 - movdqu %xmm4,32(%rsi) - pxor %xmm14,%xmm14 - movdqu %xmm5,48(%rsi) - pcmpgtd %xmm15,%xmm14 - movdqu %xmm6,64(%rsi) - leaq 80(%rsi),%rsi - pshufd $0x13,%xmm14,%xmm11 - andq $15,%r9 - jz .Lxts_dec_ret - - movdqa %xmm15,%xmm10 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm11 - pxor %xmm15,%xmm11 - jmp .Lxts_dec_done2 - -.align 16 -.Lxts_dec_one: - movups (%rdi),%xmm2 - leaq 16(%rdi),%rdi - xorps %xmm10,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -.Loop_dec1_12: -.byte 102,15,56,222,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_dec1_12 -.byte 102,15,56,223,209 - xorps %xmm10,%xmm2 - movdqa %xmm11,%xmm10 - movups %xmm2,(%rsi) - movdqa %xmm12,%xmm11 - leaq 16(%rsi),%rsi - jmp .Lxts_dec_done - -.align 16 -.Lxts_dec_two: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - leaq 32(%rdi),%rdi - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - - call _aesni_decrypt2 - - xorps %xmm10,%xmm2 - movdqa %xmm12,%xmm10 - xorps %xmm11,%xmm3 - movdqa %xmm13,%xmm11 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - leaq 32(%rsi),%rsi - jmp .Lxts_dec_done - -.align 16 -.Lxts_dec_three: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 32(%rdi),%xmm4 - leaq 48(%rdi),%rdi - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - - call _aesni_decrypt3 - - xorps %xmm10,%xmm2 - movdqa %xmm13,%xmm10 - xorps %xmm11,%xmm3 - movdqa %xmm14,%xmm11 - xorps %xmm12,%xmm4 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - leaq 48(%rsi),%rsi - jmp .Lxts_dec_done - -.align 16 -.Lxts_dec_four: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 32(%rdi),%xmm4 - xorps %xmm10,%xmm2 - movups 48(%rdi),%xmm5 - leaq 64(%rdi),%rdi - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - xorps %xmm13,%xmm5 - - call _aesni_decrypt4 - - pxor %xmm10,%xmm2 - movdqa %xmm14,%xmm10 - pxor %xmm11,%xmm3 - movdqa %xmm15,%xmm11 - pxor %xmm12,%xmm4 - movdqu %xmm2,(%rsi) - pxor %xmm13,%xmm5 - movdqu %xmm3,16(%rsi) - movdqu %xmm4,32(%rsi) - movdqu %xmm5,48(%rsi) - leaq 64(%rsi),%rsi - jmp .Lxts_dec_done - -.align 16 -.Lxts_dec_done: - andq $15,%r9 - jz .Lxts_dec_ret -.Lxts_dec_done2: - movq %r9,%rdx - movq %rbp,%rcx - movl %r10d,%eax - - movups (%rdi),%xmm2 - xorps %xmm11,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -.Loop_dec1_13: -.byte 102,15,56,222,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_dec1_13 -.byte 102,15,56,223,209 - xorps %xmm11,%xmm2 - movups %xmm2,(%rsi) - -.Lxts_dec_steal: - movzbl 16(%rdi),%eax - movzbl 
(%rsi),%ecx - leaq 1(%rdi),%rdi - movb %al,(%rsi) - movb %cl,16(%rsi) - leaq 1(%rsi),%rsi - subq $1,%rdx - jnz .Lxts_dec_steal - - subq %r9,%rsi - movq %rbp,%rcx - movl %r10d,%eax - - movups (%rsi),%xmm2 - xorps %xmm10,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -.Loop_dec1_14: -.byte 102,15,56,222,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_dec1_14 -.byte 102,15,56,223,209 - xorps %xmm10,%xmm2 - movups %xmm2,(%rsi) - -.Lxts_dec_ret: - xorps %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - movaps %xmm0,0(%rsp) - pxor %xmm8,%xmm8 - movaps %xmm0,16(%rsp) - pxor %xmm9,%xmm9 - movaps %xmm0,32(%rsp) - pxor %xmm10,%xmm10 - movaps %xmm0,48(%rsp) - pxor %xmm11,%xmm11 - movaps %xmm0,64(%rsp) - pxor %xmm12,%xmm12 - movaps %xmm0,80(%rsp) - pxor %xmm13,%xmm13 - movaps %xmm0,96(%rsp) - pxor %xmm14,%xmm14 - pxor %xmm15,%xmm15 - movq -8(%r11),%rbp -.cfi_restore %rbp - leaq (%r11),%rsp -.cfi_def_cfa_register %rsp -.Lxts_dec_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size aes_hw_xts_decrypt,.-aes_hw_xts_decrypt -.globl aes_hw_ocb_encrypt -.hidden aes_hw_ocb_encrypt -.type aes_hw_ocb_encrypt,@function -.align 32 -aes_hw_ocb_encrypt: -.cfi_startproc - leaq (%rsp),%rax - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - movq 8(%rax),%rbx - movq 8+8(%rax),%rbp - - movl 240(%rcx),%r10d - movq %rcx,%r11 - shll $4,%r10d - movups (%rcx),%xmm9 - movups 16(%rcx,%r10,1),%xmm1 - - movdqu (%r9),%xmm15 - pxor %xmm1,%xmm9 - pxor %xmm1,%xmm15 - - movl $16+32,%eax - leaq 32(%r11,%r10,1),%rcx - movups 16(%r11),%xmm1 - subq %r10,%rax - movq %rax,%r10 - - movdqu (%rbx),%xmm10 - movdqu (%rbp),%xmm8 - - testq $1,%r8 - jnz .Locb_enc_odd - - bsfq %r8,%r12 - addq $1,%r8 - shlq $4,%r12 - movdqu (%rbx,%r12,1),%xmm7 - movdqu (%rdi),%xmm2 - leaq 16(%rdi),%rdi - - call __ocb_encrypt1 - - movdqa %xmm7,%xmm15 - movups %xmm2,(%rsi) - leaq 16(%rsi),%rsi - subq $1,%rdx - jz .Locb_enc_done - -.Locb_enc_odd: - leaq 1(%r8),%r12 - leaq 3(%r8),%r13 - leaq 5(%r8),%r14 - leaq 6(%r8),%r8 - bsfq %r12,%r12 - bsfq %r13,%r13 - bsfq %r14,%r14 - shlq $4,%r12 - shlq $4,%r13 - shlq $4,%r14 - - subq $6,%rdx - jc .Locb_enc_short - jmp .Locb_enc_grandloop - -.align 32 -.Locb_enc_grandloop: - movdqu 0(%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqu 32(%rdi),%xmm4 - movdqu 48(%rdi),%xmm5 - movdqu 64(%rdi),%xmm6 - movdqu 80(%rdi),%xmm7 - leaq 96(%rdi),%rdi - - call __ocb_encrypt6 - - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) - leaq 96(%rsi),%rsi - subq $6,%rdx - jnc .Locb_enc_grandloop - -.Locb_enc_short: - addq $6,%rdx - jz .Locb_enc_done - - movdqu 0(%rdi),%xmm2 - cmpq $2,%rdx - jb .Locb_enc_one - movdqu 16(%rdi),%xmm3 - je .Locb_enc_two - - movdqu 32(%rdi),%xmm4 - cmpq $4,%rdx - jb .Locb_enc_three - movdqu 48(%rdi),%xmm5 - je .Locb_enc_four - - movdqu 64(%rdi),%xmm6 - pxor %xmm7,%xmm7 - - call __ocb_encrypt6 - - movdqa %xmm14,%xmm15 - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - - jmp .Locb_enc_done - -.align 16 -.Locb_enc_one: - movdqa %xmm10,%xmm7 - - call 
__ocb_encrypt1 - - movdqa %xmm7,%xmm15 - movups %xmm2,0(%rsi) - jmp .Locb_enc_done - -.align 16 -.Locb_enc_two: - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - - call __ocb_encrypt4 - - movdqa %xmm11,%xmm15 - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - - jmp .Locb_enc_done - -.align 16 -.Locb_enc_three: - pxor %xmm5,%xmm5 - - call __ocb_encrypt4 - - movdqa %xmm12,%xmm15 - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - - jmp .Locb_enc_done - -.align 16 -.Locb_enc_four: - call __ocb_encrypt4 - - movdqa %xmm13,%xmm15 - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - -.Locb_enc_done: - pxor %xmm0,%xmm15 - movdqu %xmm8,(%rbp) - movdqu %xmm15,(%r9) - - xorps %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - pxor %xmm8,%xmm8 - pxor %xmm9,%xmm9 - pxor %xmm10,%xmm10 - pxor %xmm11,%xmm11 - pxor %xmm12,%xmm12 - pxor %xmm13,%xmm13 - pxor %xmm14,%xmm14 - pxor %xmm15,%xmm15 - leaq 40(%rsp),%rax -.cfi_def_cfa %rax,8 - movq -40(%rax),%r14 -.cfi_restore %r14 - movq -32(%rax),%r13 -.cfi_restore %r13 - movq -24(%rax),%r12 -.cfi_restore %r12 - movq -16(%rax),%rbp -.cfi_restore %rbp - movq -8(%rax),%rbx -.cfi_restore %rbx - leaq (%rax),%rsp -.cfi_def_cfa_register %rsp -.Locb_enc_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size aes_hw_ocb_encrypt,.-aes_hw_ocb_encrypt - -.type __ocb_encrypt6,@function -.align 32 -__ocb_encrypt6: - pxor %xmm9,%xmm15 - movdqu (%rbx,%r12,1),%xmm11 - movdqa %xmm10,%xmm12 - movdqu (%rbx,%r13,1),%xmm13 - movdqa %xmm10,%xmm14 - pxor %xmm15,%xmm10 - movdqu (%rbx,%r14,1),%xmm15 - pxor %xmm10,%xmm11 - pxor %xmm2,%xmm8 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm12 - pxor %xmm3,%xmm8 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm13 - pxor %xmm4,%xmm8 - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm14 - pxor %xmm5,%xmm8 - pxor %xmm13,%xmm5 - pxor %xmm14,%xmm15 - pxor %xmm6,%xmm8 - pxor %xmm14,%xmm6 - pxor %xmm7,%xmm8 - pxor %xmm15,%xmm7 - movups 32(%r11),%xmm0 - - leaq 1(%r8),%r12 - leaq 3(%r8),%r13 - leaq 5(%r8),%r14 - addq $6,%r8 - pxor %xmm9,%xmm10 - bsfq %r12,%r12 - bsfq %r13,%r13 - bsfq %r14,%r14 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - pxor %xmm9,%xmm11 - pxor %xmm9,%xmm12 -.byte 102,15,56,220,241 - pxor %xmm9,%xmm13 - pxor %xmm9,%xmm14 -.byte 102,15,56,220,249 - movups 48(%r11),%xmm1 - pxor %xmm9,%xmm15 - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups 64(%r11),%xmm0 - shlq $4,%r12 - shlq $4,%r13 - jmp .Locb_enc_loop6 - -.align 32 -.Locb_enc_loop6: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Locb_enc_loop6 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups 16(%r11),%xmm1 - shlq $4,%r14 - -.byte 102,65,15,56,221,210 - movdqu (%rbx),%xmm10 - movq %r10,%rax -.byte 102,65,15,56,221,219 -.byte 102,65,15,56,221,228 -.byte 102,65,15,56,221,237 -.byte 102,65,15,56,221,246 -.byte 102,65,15,56,221,255 - .byte 0xf3,0xc3 -.size 
__ocb_encrypt6,.-__ocb_encrypt6 - -.type __ocb_encrypt4,@function -.align 32 -__ocb_encrypt4: - pxor %xmm9,%xmm15 - movdqu (%rbx,%r12,1),%xmm11 - movdqa %xmm10,%xmm12 - movdqu (%rbx,%r13,1),%xmm13 - pxor %xmm15,%xmm10 - pxor %xmm10,%xmm11 - pxor %xmm2,%xmm8 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm12 - pxor %xmm3,%xmm8 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm13 - pxor %xmm4,%xmm8 - pxor %xmm12,%xmm4 - pxor %xmm5,%xmm8 - pxor %xmm13,%xmm5 - movups 32(%r11),%xmm0 - - pxor %xmm9,%xmm10 - pxor %xmm9,%xmm11 - pxor %xmm9,%xmm12 - pxor %xmm9,%xmm13 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movups 48(%r11),%xmm1 - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - movups 64(%r11),%xmm0 - jmp .Locb_enc_loop4 - -.align 32 -.Locb_enc_loop4: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Locb_enc_loop4 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movups 16(%r11),%xmm1 - movq %r10,%rax - -.byte 102,65,15,56,221,210 -.byte 102,65,15,56,221,219 -.byte 102,65,15,56,221,228 -.byte 102,65,15,56,221,237 - .byte 0xf3,0xc3 -.size __ocb_encrypt4,.-__ocb_encrypt4 - -.type __ocb_encrypt1,@function -.align 32 -__ocb_encrypt1: - pxor %xmm15,%xmm7 - pxor %xmm9,%xmm7 - pxor %xmm2,%xmm8 - pxor %xmm7,%xmm2 - movups 32(%r11),%xmm0 - -.byte 102,15,56,220,209 - movups 48(%r11),%xmm1 - pxor %xmm9,%xmm7 - -.byte 102,15,56,220,208 - movups 64(%r11),%xmm0 - jmp .Locb_enc_loop1 - -.align 32 -.Locb_enc_loop1: -.byte 102,15,56,220,209 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,220,208 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Locb_enc_loop1 - -.byte 102,15,56,220,209 - movups 16(%r11),%xmm1 - movq %r10,%rax - -.byte 102,15,56,221,215 - .byte 0xf3,0xc3 -.size __ocb_encrypt1,.-__ocb_encrypt1 - -.globl aes_hw_ocb_decrypt -.hidden aes_hw_ocb_decrypt -.type aes_hw_ocb_decrypt,@function -.align 32 -aes_hw_ocb_decrypt: -.cfi_startproc - leaq (%rsp),%rax - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - movq 8(%rax),%rbx - movq 8+8(%rax),%rbp - - movl 240(%rcx),%r10d - movq %rcx,%r11 - shll $4,%r10d - movups (%rcx),%xmm9 - movups 16(%rcx,%r10,1),%xmm1 - - movdqu (%r9),%xmm15 - pxor %xmm1,%xmm9 - pxor %xmm1,%xmm15 - - movl $16+32,%eax - leaq 32(%r11,%r10,1),%rcx - movups 16(%r11),%xmm1 - subq %r10,%rax - movq %rax,%r10 - - movdqu (%rbx),%xmm10 - movdqu (%rbp),%xmm8 - - testq $1,%r8 - jnz .Locb_dec_odd - - bsfq %r8,%r12 - addq $1,%r8 - shlq $4,%r12 - movdqu (%rbx,%r12,1),%xmm7 - movdqu (%rdi),%xmm2 - leaq 16(%rdi),%rdi - - call __ocb_decrypt1 - - movdqa %xmm7,%xmm15 - movups %xmm2,(%rsi) - xorps %xmm2,%xmm8 - leaq 16(%rsi),%rsi - subq $1,%rdx - jz .Locb_dec_done - -.Locb_dec_odd: - leaq 1(%r8),%r12 - leaq 3(%r8),%r13 - leaq 5(%r8),%r14 - leaq 6(%r8),%r8 - bsfq %r12,%r12 - bsfq %r13,%r13 - bsfq %r14,%r14 - shlq $4,%r12 - shlq $4,%r13 - shlq $4,%r14 - - subq $6,%rdx - jc .Locb_dec_short - jmp .Locb_dec_grandloop - -.align 32 -.Locb_dec_grandloop: - movdqu 0(%rdi),%xmm2 - 
movdqu 16(%rdi),%xmm3 - movdqu 32(%rdi),%xmm4 - movdqu 48(%rdi),%xmm5 - movdqu 64(%rdi),%xmm6 - movdqu 80(%rdi),%xmm7 - leaq 96(%rdi),%rdi - - call __ocb_decrypt6 - - movups %xmm2,0(%rsi) - pxor %xmm2,%xmm8 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm8 - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm8 - movups %xmm5,48(%rsi) - pxor %xmm5,%xmm8 - movups %xmm6,64(%rsi) - pxor %xmm6,%xmm8 - movups %xmm7,80(%rsi) - pxor %xmm7,%xmm8 - leaq 96(%rsi),%rsi - subq $6,%rdx - jnc .Locb_dec_grandloop - -.Locb_dec_short: - addq $6,%rdx - jz .Locb_dec_done - - movdqu 0(%rdi),%xmm2 - cmpq $2,%rdx - jb .Locb_dec_one - movdqu 16(%rdi),%xmm3 - je .Locb_dec_two - - movdqu 32(%rdi),%xmm4 - cmpq $4,%rdx - jb .Locb_dec_three - movdqu 48(%rdi),%xmm5 - je .Locb_dec_four - - movdqu 64(%rdi),%xmm6 - pxor %xmm7,%xmm7 - - call __ocb_decrypt6 - - movdqa %xmm14,%xmm15 - movups %xmm2,0(%rsi) - pxor %xmm2,%xmm8 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm8 - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm8 - movups %xmm5,48(%rsi) - pxor %xmm5,%xmm8 - movups %xmm6,64(%rsi) - pxor %xmm6,%xmm8 - - jmp .Locb_dec_done - -.align 16 -.Locb_dec_one: - movdqa %xmm10,%xmm7 - - call __ocb_decrypt1 - - movdqa %xmm7,%xmm15 - movups %xmm2,0(%rsi) - xorps %xmm2,%xmm8 - jmp .Locb_dec_done - -.align 16 -.Locb_dec_two: - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - - call __ocb_decrypt4 - - movdqa %xmm11,%xmm15 - movups %xmm2,0(%rsi) - xorps %xmm2,%xmm8 - movups %xmm3,16(%rsi) - xorps %xmm3,%xmm8 - - jmp .Locb_dec_done - -.align 16 -.Locb_dec_three: - pxor %xmm5,%xmm5 - - call __ocb_decrypt4 - - movdqa %xmm12,%xmm15 - movups %xmm2,0(%rsi) - xorps %xmm2,%xmm8 - movups %xmm3,16(%rsi) - xorps %xmm3,%xmm8 - movups %xmm4,32(%rsi) - xorps %xmm4,%xmm8 - - jmp .Locb_dec_done - -.align 16 -.Locb_dec_four: - call __ocb_decrypt4 - - movdqa %xmm13,%xmm15 - movups %xmm2,0(%rsi) - pxor %xmm2,%xmm8 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm8 - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm8 - movups %xmm5,48(%rsi) - pxor %xmm5,%xmm8 - -.Locb_dec_done: - pxor %xmm0,%xmm15 - movdqu %xmm8,(%rbp) - movdqu %xmm15,(%r9) - - xorps %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - pxor %xmm8,%xmm8 - pxor %xmm9,%xmm9 - pxor %xmm10,%xmm10 - pxor %xmm11,%xmm11 - pxor %xmm12,%xmm12 - pxor %xmm13,%xmm13 - pxor %xmm14,%xmm14 - pxor %xmm15,%xmm15 - leaq 40(%rsp),%rax -.cfi_def_cfa %rax,8 - movq -40(%rax),%r14 -.cfi_restore %r14 - movq -32(%rax),%r13 -.cfi_restore %r13 - movq -24(%rax),%r12 -.cfi_restore %r12 - movq -16(%rax),%rbp -.cfi_restore %rbp - movq -8(%rax),%rbx -.cfi_restore %rbx - leaq (%rax),%rsp -.cfi_def_cfa_register %rsp -.Locb_dec_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size aes_hw_ocb_decrypt,.-aes_hw_ocb_decrypt - -.type __ocb_decrypt6,@function -.align 32 -__ocb_decrypt6: - pxor %xmm9,%xmm15 - movdqu (%rbx,%r12,1),%xmm11 - movdqa %xmm10,%xmm12 - movdqu (%rbx,%r13,1),%xmm13 - movdqa %xmm10,%xmm14 - pxor %xmm15,%xmm10 - movdqu (%rbx,%r14,1),%xmm15 - pxor %xmm10,%xmm11 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm12 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm13 - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm14 - pxor %xmm13,%xmm5 - pxor %xmm14,%xmm15 - pxor %xmm14,%xmm6 - pxor %xmm15,%xmm7 - movups 32(%r11),%xmm0 - - leaq 1(%r8),%r12 - leaq 3(%r8),%r13 - leaq 5(%r8),%r14 - addq $6,%r8 - pxor %xmm9,%xmm10 - bsfq %r12,%r12 - bsfq %r13,%r13 - bsfq %r14,%r14 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - pxor %xmm9,%xmm11 - pxor %xmm9,%xmm12 -.byte 
102,15,56,222,241 - pxor %xmm9,%xmm13 - pxor %xmm9,%xmm14 -.byte 102,15,56,222,249 - movups 48(%r11),%xmm1 - pxor %xmm9,%xmm15 - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 - movups 64(%r11),%xmm0 - shlq $4,%r12 - shlq $4,%r13 - jmp .Locb_dec_loop6 - -.align 32 -.Locb_dec_loop6: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Locb_dec_loop6 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups 16(%r11),%xmm1 - shlq $4,%r14 - -.byte 102,65,15,56,223,210 - movdqu (%rbx),%xmm10 - movq %r10,%rax -.byte 102,65,15,56,223,219 -.byte 102,65,15,56,223,228 -.byte 102,65,15,56,223,237 -.byte 102,65,15,56,223,246 -.byte 102,65,15,56,223,255 - .byte 0xf3,0xc3 -.size __ocb_decrypt6,.-__ocb_decrypt6 - -.type __ocb_decrypt4,@function -.align 32 -__ocb_decrypt4: - pxor %xmm9,%xmm15 - movdqu (%rbx,%r12,1),%xmm11 - movdqa %xmm10,%xmm12 - movdqu (%rbx,%r13,1),%xmm13 - pxor %xmm15,%xmm10 - pxor %xmm10,%xmm11 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm12 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm13 - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm5 - movups 32(%r11),%xmm0 - - pxor %xmm9,%xmm10 - pxor %xmm9,%xmm11 - pxor %xmm9,%xmm12 - pxor %xmm9,%xmm13 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - movups 48(%r11),%xmm1 - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 - movups 64(%r11),%xmm0 - jmp .Locb_dec_loop4 - -.align 32 -.Locb_dec_loop4: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Locb_dec_loop4 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - movups 16(%r11),%xmm1 - movq %r10,%rax - -.byte 102,65,15,56,223,210 -.byte 102,65,15,56,223,219 -.byte 102,65,15,56,223,228 -.byte 102,65,15,56,223,237 - .byte 0xf3,0xc3 -.size __ocb_decrypt4,.-__ocb_decrypt4 - -.type __ocb_decrypt1,@function -.align 32 -__ocb_decrypt1: - pxor %xmm15,%xmm7 - pxor %xmm9,%xmm7 - pxor %xmm7,%xmm2 - movups 32(%r11),%xmm0 - -.byte 102,15,56,222,209 - movups 48(%r11),%xmm1 - pxor %xmm9,%xmm7 - -.byte 102,15,56,222,208 - movups 64(%r11),%xmm0 - jmp .Locb_dec_loop1 - -.align 32 -.Locb_dec_loop1: -.byte 102,15,56,222,209 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,222,208 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Locb_dec_loop1 - -.byte 102,15,56,222,209 - movups 16(%r11),%xmm1 - movq %r10,%rax - -.byte 102,15,56,223,215 - .byte 0xf3,0xc3 -.size __ocb_decrypt1,.-__ocb_decrypt1 -.globl aes_hw_cbc_encrypt -.hidden aes_hw_cbc_encrypt -.type aes_hw_cbc_encrypt,@function -.align 16 -aes_hw_cbc_encrypt: -.cfi_startproc - testq %rdx,%rdx - jz .Lcbc_ret - - movl 240(%rcx),%r10d - movq %rcx,%r11 - testl %r9d,%r9d - jz .Lcbc_decrypt -#--------------------------- CBC ENCRYPT ------------------------------# - movups 
(%r8),%xmm2 - movl %r10d,%eax - cmpq $16,%rdx - jb .Lcbc_enc_tail - subq $16,%rdx - jmp .Lcbc_enc_loop -.align 16 -.Lcbc_enc_loop: - movups (%rdi),%xmm3 - leaq 16(%rdi),%rdi -#xorps %xmm3,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - xorps %xmm0,%xmm3 - leaq 32(%rcx),%rcx - xorps %xmm3,%xmm2 -.Loop_enc1_15: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_enc1_15 -.byte 102,15,56,221,209 - movl %r10d,%eax - movq %r11,%rcx - movups %xmm2,0(%rsi) - leaq 16(%rsi),%rsi - subq $16,%rdx - jnc .Lcbc_enc_loop - addq $16,%rdx - jnz .Lcbc_enc_tail - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - movups %xmm2,(%r8) - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - jmp .Lcbc_ret - -.Lcbc_enc_tail: - movq %rdx,%rcx - xchgq %rdi,%rsi -.long 0x9066A4F3 - movl $16,%ecx - subq %rdx,%rcx - xorl %eax,%eax -.long 0x9066AAF3 - leaq -16(%rdi),%rdi - movl %r10d,%eax - movq %rdi,%rsi - movq %r11,%rcx - xorq %rdx,%rdx - jmp .Lcbc_enc_loop -#--------------------------- CBC DECRYPT ------------------------------# .align 16 .Lcbc_decrypt: cmpq $16,%rdx @@ -3542,12 +1547,12 @@ aes_hw_cbc_encrypt: movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 -.Loop_dec1_16: +.Loop_dec1_7: .byte 102,15,56,222,209 decl %r10d movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_dec1_16 + jnz .Loop_dec1_7 .byte 102,15,56,223,209 pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 @@ -3960,12 +1965,12 @@ aes_hw_cbc_encrypt: movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 -.Loop_dec1_17: +.Loop_dec1_8: .byte 102,15,56,222,209 decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_dec1_17 + jnz .Loop_dec1_8 .byte 102,15,56,223,209 xorps %xmm10,%xmm2 movaps %xmm11,%xmm10 diff --git a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S index 7b2038899c4bfe..5236aa66f6d374 100644 --- a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S +++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S @@ -221,7 +221,7 @@ _bsaes_encrypt8_bitslice: pxor %xmm13,%xmm8 pxor %xmm14,%xmm7 -#Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3 + @@ -692,7 +692,7 @@ _bsaes_decrypt8: pxor %xmm13,%xmm8 pxor %xmm14,%xmm7 -#Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3 + @@ -1079,7 +1079,7 @@ _bsaes_key_convert: jnz .Lkey_loop movdqa 80(%r11),%xmm7 -#movdqa %xmm6, (%rax) + .byte 0xf3,0xc3 .cfi_endproc .size _bsaes_key_convert,.-_bsaes_key_convert diff --git a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S index 9543246194d549..0b36afac943b2c 100644 --- a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S +++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S @@ -731,7 +731,7 @@ gcm_init_clmul: pshufd $255,%xmm2,%xmm4 movdqa %xmm2,%xmm3 psllq $1,%xmm2 - pxor %xmm5,%xmm5# + pxor %xmm5,%xmm5 psrlq $63,%xmm3 pcmpgtd %xmm4,%xmm5 pslldq $8,%xmm3 @@ -745,43 +745,43 @@ gcm_init_clmul: pshufd $78,%xmm2,%xmm6 movdqa %xmm2,%xmm0 pxor %xmm2,%xmm6 - movdqa %xmm0,%xmm1# + movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pxor %xmm0,%xmm3# + pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,222,0 - pxor %xmm0,%xmm3# - pxor %xmm1,%xmm3# + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 - movdqa %xmm3,%xmm4# + movdqa %xmm3,%xmm4 psrldq $8,%xmm3 - pslldq $8,%xmm4# + pslldq $8,%xmm4 pxor %xmm3,%xmm1 - pxor %xmm4,%xmm0# + pxor %xmm4,%xmm0 - movdqa %xmm0,%xmm4# + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq 
$5,%xmm0 - pxor %xmm0,%xmm3# + pxor %xmm0,%xmm3 psllq $1,%xmm0 - pxor %xmm3,%xmm0# - psllq $57,%xmm0# - movdqa %xmm0,%xmm3# + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm3# + psrldq $8,%xmm3 pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1# + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm1# + pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 - pxor %xmm4,%xmm0# - psrlq $1,%xmm0# - pxor %xmm1,%xmm0# + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 pshufd $78,%xmm2,%xmm3 pshufd $78,%xmm0,%xmm4 pxor %xmm2,%xmm3 @@ -790,81 +790,81 @@ gcm_init_clmul: movdqu %xmm0,16(%rdi) .byte 102,15,58,15,227,8 movdqu %xmm4,32(%rdi) - movdqa %xmm0,%xmm1# + movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pxor %xmm0,%xmm3# + pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,222,0 - pxor %xmm0,%xmm3# - pxor %xmm1,%xmm3# + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 - movdqa %xmm3,%xmm4# + movdqa %xmm3,%xmm4 psrldq $8,%xmm3 - pslldq $8,%xmm4# + pslldq $8,%xmm4 pxor %xmm3,%xmm1 - pxor %xmm4,%xmm0# + pxor %xmm4,%xmm0 - movdqa %xmm0,%xmm4# + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 - pxor %xmm0,%xmm3# + pxor %xmm0,%xmm3 psllq $1,%xmm0 - pxor %xmm3,%xmm0# - psllq $57,%xmm0# - movdqa %xmm0,%xmm3# + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm3# + psrldq $8,%xmm3 pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1# + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm1# + pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 - pxor %xmm4,%xmm0# - psrlq $1,%xmm0# - pxor %xmm1,%xmm0# + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 movdqa %xmm0,%xmm5 - movdqa %xmm0,%xmm1# + movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pxor %xmm0,%xmm3# + pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,222,0 - pxor %xmm0,%xmm3# - pxor %xmm1,%xmm3# + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 - movdqa %xmm3,%xmm4# + movdqa %xmm3,%xmm4 psrldq $8,%xmm3 - pslldq $8,%xmm4# + pslldq $8,%xmm4 pxor %xmm3,%xmm1 - pxor %xmm4,%xmm0# + pxor %xmm4,%xmm0 - movdqa %xmm0,%xmm4# + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 - pxor %xmm0,%xmm3# + pxor %xmm0,%xmm3 psllq $1,%xmm0 - pxor %xmm3,%xmm0# - psllq $57,%xmm0# - movdqa %xmm0,%xmm3# + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm3# + psrldq $8,%xmm3 pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1# + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm1# + pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 - pxor %xmm4,%xmm0# - psrlq $1,%xmm0# - pxor %xmm1,%xmm0# + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 pshufd $78,%xmm5,%xmm3 pshufd $78,%xmm0,%xmm4 pxor %xmm5,%xmm3 @@ -888,43 +888,43 @@ gcm_gmult_clmul: movdqu (%rsi),%xmm2 movdqu 32(%rsi),%xmm4 .byte 102,15,56,0,197 - movdqa %xmm0,%xmm1# + movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pxor %xmm0,%xmm3# + pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,220,0 - pxor %xmm0,%xmm3# - pxor %xmm1,%xmm3# + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 - movdqa %xmm3,%xmm4# + movdqa %xmm3,%xmm4 psrldq $8,%xmm3 - pslldq $8,%xmm4# + pslldq $8,%xmm4 pxor %xmm3,%xmm1 - pxor %xmm4,%xmm0# + pxor %xmm4,%xmm0 - movdqa %xmm0,%xmm4# + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 - pxor %xmm0,%xmm3# + pxor %xmm0,%xmm3 psllq $1,%xmm0 - pxor %xmm3,%xmm0# - psllq $57,%xmm0# - movdqa %xmm0,%xmm3# + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm3# + psrldq $8,%xmm3 
pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1# + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm1# + pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 - pxor %xmm4,%xmm0# - psrlq $1,%xmm0# - pxor %xmm1,%xmm0# + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 .byte 102,15,56,0,197 movdqu %xmm0,(%rdi) .byte 0xf3,0xc3 @@ -962,9 +962,9 @@ gcm_ghash_clmul: movdqu 48(%rsi),%xmm14 movdqu 64(%rsi),%xmm15 -####### -# + + movdqu 48(%rdx),%xmm3 movdqu 32(%rdx),%xmm11 .byte 102,65,15,56,0,218 @@ -1031,28 +1031,28 @@ gcm_ghash_clmul: pxor %xmm0,%xmm8 movdqa %xmm3,%xmm5 - pxor %xmm1,%xmm8# + pxor %xmm1,%xmm8 pxor %xmm3,%xmm4 - movdqa %xmm8,%xmm9# + movdqa %xmm8,%xmm9 .byte 102,68,15,58,68,234,17 pslldq $8,%xmm8 - psrldq $8,%xmm9# + psrldq $8,%xmm9 pxor %xmm8,%xmm0 movdqa .L7_mask(%rip),%xmm8 - pxor %xmm9,%xmm1# + pxor %xmm9,%xmm1 .byte 102,76,15,110,200 pand %xmm0,%xmm8 .byte 102,69,15,56,0,200 - pxor %xmm0,%xmm9# + pxor %xmm0,%xmm9 .byte 102,68,15,58,68,231,0 - psllq $57,%xmm9# - movdqa %xmm9,%xmm8# + psllq $57,%xmm9 + movdqa %xmm9,%xmm8 pslldq $8,%xmm9 .byte 102,15,58,68,222,0 - psrldq $8,%xmm8# + psrldq $8,%xmm8 pxor %xmm9,%xmm0 - pxor %xmm8,%xmm1# + pxor %xmm8,%xmm1 movdqu 0(%rdx),%xmm8 movdqa %xmm0,%xmm9 @@ -1065,19 +1065,19 @@ gcm_ghash_clmul: xorps %xmm13,%xmm5 movups 80(%rsi),%xmm7 .byte 102,69,15,56,0,194 - pxor %xmm9,%xmm1# + pxor %xmm9,%xmm1 pxor %xmm0,%xmm9 psrlq $5,%xmm0 movdqa %xmm11,%xmm13 pxor %xmm12,%xmm4 pshufd $78,%xmm11,%xmm12 - pxor %xmm9,%xmm0# + pxor %xmm9,%xmm0 pxor %xmm8,%xmm1 pxor %xmm11,%xmm12 .byte 102,69,15,58,68,222,0 - psrlq $1,%xmm0# - pxor %xmm1,%xmm0# + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 movdqa %xmm0,%xmm1 .byte 102,69,15,58,68,238,17 xorps %xmm11,%xmm3 @@ -1101,48 +1101,48 @@ gcm_ghash_clmul: pxor %xmm0,%xmm1 pxor %xmm4,%xmm8 - pxor %xmm1,%xmm8# + pxor %xmm1,%xmm8 pxor %xmm0,%xmm1 - movdqa %xmm8,%xmm9# + movdqa %xmm8,%xmm9 psrldq $8,%xmm8 - pslldq $8,%xmm9# + pslldq $8,%xmm9 pxor %xmm8,%xmm1 - pxor %xmm9,%xmm0# + pxor %xmm9,%xmm0 - movdqa %xmm0,%xmm4# + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 - pxor %xmm0,%xmm3# + pxor %xmm0,%xmm3 psllq $1,%xmm0 - pxor %xmm3,%xmm0# - psllq $57,%xmm0# - movdqa %xmm0,%xmm3# + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm3# + psrldq $8,%xmm3 pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1# + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm1# + pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 - pxor %xmm4,%xmm0# - psrlq $1,%xmm0# - pxor %xmm1,%xmm0# + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 addq $0x40,%rcx jz .Ldone movdqu 32(%rsi),%xmm7 subq $0x10,%rcx jz .Lodd_tail .Lskip4x: -####### -# [(H*Ii+1) + (H*Xi+1)] mod P = -# [(H*Ii+1) + H^2*(Ii+Xi)] mod P -# + + + + movdqu (%rdx),%xmm8 movdqu 16(%rdx),%xmm3 .byte 102,69,15,56,0,194 @@ -1167,8 +1167,8 @@ gcm_ghash_clmul: .Lmod_loop: movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm8 - pshufd $78,%xmm0,%xmm4# - pxor %xmm0,%xmm4# + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 .byte 102,15,58,68,198,0 .byte 102,15,58,68,206,17 @@ -1185,41 +1185,41 @@ gcm_ghash_clmul: pxor %xmm9,%xmm1 pxor %xmm8,%xmm4 .byte 102,65,15,56,0,218 - movdqa %xmm4,%xmm8# + movdqa %xmm4,%xmm8 psrldq $8,%xmm8 - pslldq $8,%xmm4# + pslldq $8,%xmm4 pxor %xmm8,%xmm1 - pxor %xmm4,%xmm0# + pxor %xmm4,%xmm0 - movdqa %xmm3,%xmm5# + movdqa %xmm3,%xmm5 movdqa %xmm0,%xmm9 movdqa %xmm0,%xmm8 psllq $5,%xmm0 - pxor %xmm0,%xmm8# + pxor %xmm0,%xmm8 .byte 102,15,58,68,218,0 psllq $1,%xmm0 - pxor %xmm8,%xmm0# - psllq $57,%xmm0# - movdqa %xmm0,%xmm8# + pxor %xmm8,%xmm0 
+ psllq $57,%xmm0 + movdqa %xmm0,%xmm8 pslldq $8,%xmm0 - psrldq $8,%xmm8# + psrldq $8,%xmm8 pxor %xmm9,%xmm0 pshufd $78,%xmm5,%xmm4 - pxor %xmm8,%xmm1# - pxor %xmm5,%xmm4# + pxor %xmm8,%xmm1 + pxor %xmm5,%xmm4 movdqa %xmm0,%xmm9 psrlq $1,%xmm0 .byte 102,15,58,68,234,17 - pxor %xmm9,%xmm1# + pxor %xmm9,%xmm1 pxor %xmm0,%xmm9 psrlq $5,%xmm0 - pxor %xmm9,%xmm0# + pxor %xmm9,%xmm0 leaq 32(%rdx),%rdx - psrlq $1,%xmm0# + psrlq $1,%xmm0 .byte 102,15,58,68,231,0 - pxor %xmm1,%xmm0# + pxor %xmm1,%xmm0 subq $0x20,%rcx ja .Lmod_loop @@ -1227,8 +1227,8 @@ gcm_ghash_clmul: .Leven_tail: movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm8 - pshufd $78,%xmm0,%xmm4# - pxor %xmm0,%xmm4# + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 .byte 102,15,58,68,198,0 .byte 102,15,58,68,206,17 @@ -1239,34 +1239,34 @@ gcm_ghash_clmul: pxor %xmm0,%xmm8 pxor %xmm1,%xmm8 pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm8# + movdqa %xmm4,%xmm8 psrldq $8,%xmm8 - pslldq $8,%xmm4# + pslldq $8,%xmm4 pxor %xmm8,%xmm1 - pxor %xmm4,%xmm0# + pxor %xmm4,%xmm0 - movdqa %xmm0,%xmm4# + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 - pxor %xmm0,%xmm3# + pxor %xmm0,%xmm3 psllq $1,%xmm0 - pxor %xmm3,%xmm0# - psllq $57,%xmm0# - movdqa %xmm0,%xmm3# + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm3# + psrldq $8,%xmm3 pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1# + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm1# + pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 - pxor %xmm4,%xmm0# - psrlq $1,%xmm0# - pxor %xmm1,%xmm0# + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 testq %rcx,%rcx jnz .Ldone @@ -1274,43 +1274,43 @@ gcm_ghash_clmul: movdqu (%rdx),%xmm8 .byte 102,69,15,56,0,194 pxor %xmm8,%xmm0 - movdqa %xmm0,%xmm1# + movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pxor %xmm0,%xmm3# + pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,223,0 - pxor %xmm0,%xmm3# - pxor %xmm1,%xmm3# + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 - movdqa %xmm3,%xmm4# + movdqa %xmm3,%xmm4 psrldq $8,%xmm3 - pslldq $8,%xmm4# + pslldq $8,%xmm4 pxor %xmm3,%xmm1 - pxor %xmm4,%xmm0# + pxor %xmm4,%xmm0 - movdqa %xmm0,%xmm4# + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 - pxor %xmm0,%xmm3# + pxor %xmm0,%xmm3 psllq $1,%xmm0 - pxor %xmm3,%xmm0# - psllq $57,%xmm0# - movdqa %xmm0,%xmm3# + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm3# + psrldq $8,%xmm3 pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1# + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm1# + pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 - pxor %xmm4,%xmm0# - psrlq $1,%xmm0# - pxor %xmm1,%xmm0# + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 .Ldone: .byte 102,65,15,56,0,194 movdqu %xmm0,(%rdi) @@ -1332,7 +1332,7 @@ gcm_init_avx: vpshufd $255,%xmm2,%xmm4 vpsrlq $63,%xmm2,%xmm3 vpsllq $1,%xmm2,%xmm2 - vpxor %xmm5,%xmm5,%xmm5# + vpxor %xmm5,%xmm5,%xmm5 vpcmpgtd %xmm4,%xmm5,%xmm5 vpslldq $8,%xmm3,%xmm3 vpor %xmm3,%xmm2,%xmm2 @@ -1351,65 +1351,65 @@ gcm_init_avx: vpalignr $8,%xmm3,%xmm4,%xmm5 vmovdqu %xmm5,-16(%rdi) vpunpckhqdq %xmm0,%xmm0,%xmm3 - vpxor %xmm0,%xmm3,%xmm3# - vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1####### - vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0####### - vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3####### - vpxor %xmm0,%xmm1,%xmm4# - vpxor %xmm4,%xmm3,%xmm3# - - vpslldq $8,%xmm3,%xmm4# + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 + vpxor %xmm0,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + + vpslldq 
$8,%xmm3,%xmm4 vpsrldq $8,%xmm3,%xmm3 - vpxor %xmm4,%xmm0,%xmm0# + vpxor %xmm4,%xmm0,%xmm0 vpxor %xmm3,%xmm1,%xmm1 vpsllq $57,%xmm0,%xmm3 vpsllq $62,%xmm0,%xmm4 - vpxor %xmm3,%xmm4,%xmm4# + vpxor %xmm3,%xmm4,%xmm4 vpsllq $63,%xmm0,%xmm3 - vpxor %xmm3,%xmm4,%xmm4# - vpslldq $8,%xmm4,%xmm3# + vpxor %xmm3,%xmm4,%xmm4 + vpslldq $8,%xmm4,%xmm3 vpsrldq $8,%xmm4,%xmm4 - vpxor %xmm3,%xmm0,%xmm0# + vpxor %xmm3,%xmm0,%xmm0 vpxor %xmm4,%xmm1,%xmm1 vpsrlq $1,%xmm0,%xmm4 vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm4,%xmm0,%xmm0# + vpxor %xmm4,%xmm0,%xmm0 vpsrlq $5,%xmm4,%xmm4 - vpxor %xmm4,%xmm0,%xmm0# - vpsrlq $1,%xmm0,%xmm0# - vpxor %xmm1,%xmm0,%xmm0# + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $1,%xmm0,%xmm0 + vpxor %xmm1,%xmm0,%xmm0 .Linit_start_avx: vmovdqa %xmm0,%xmm5 vpunpckhqdq %xmm0,%xmm0,%xmm3 - vpxor %xmm0,%xmm3,%xmm3# - vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1####### - vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0####### - vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3####### - vpxor %xmm0,%xmm1,%xmm4# - vpxor %xmm4,%xmm3,%xmm3# - - vpslldq $8,%xmm3,%xmm4# + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 + vpxor %xmm0,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + + vpslldq $8,%xmm3,%xmm4 vpsrldq $8,%xmm3,%xmm3 - vpxor %xmm4,%xmm0,%xmm0# + vpxor %xmm4,%xmm0,%xmm0 vpxor %xmm3,%xmm1,%xmm1 vpsllq $57,%xmm0,%xmm3 vpsllq $62,%xmm0,%xmm4 - vpxor %xmm3,%xmm4,%xmm4# + vpxor %xmm3,%xmm4,%xmm4 vpsllq $63,%xmm0,%xmm3 - vpxor %xmm3,%xmm4,%xmm4# - vpslldq $8,%xmm4,%xmm3# + vpxor %xmm3,%xmm4,%xmm4 + vpslldq $8,%xmm4,%xmm3 vpsrldq $8,%xmm4,%xmm4 - vpxor %xmm3,%xmm0,%xmm0# + vpxor %xmm3,%xmm0,%xmm0 vpxor %xmm4,%xmm1,%xmm1 vpsrlq $1,%xmm0,%xmm4 vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm4,%xmm0,%xmm0# + vpxor %xmm4,%xmm0,%xmm0 vpsrlq $5,%xmm4,%xmm4 - vpxor %xmm4,%xmm0,%xmm0# - vpsrlq $1,%xmm0,%xmm0# - vpxor %xmm1,%xmm0,%xmm0# + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $1,%xmm0,%xmm0 + vpxor %xmm1,%xmm0,%xmm0 vpshufd $78,%xmm5,%xmm3 vpshufd $78,%xmm0,%xmm4 vpxor %xmm5,%xmm3,%xmm3 diff --git a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S index 635282bbe02cb6..3a575228bc2d89 100644 --- a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S +++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S @@ -35,7 +35,7 @@ .LordK: .quad 0xccd1c8aaee00bc4f -################################################################################ + .globl ecp_nistz256_neg .hidden ecp_nistz256_neg @@ -93,7 +93,7 @@ ecp_nistz256_neg: .byte 0xf3,0xc3 .cfi_endproc .size ecp_nistz256_neg,.-ecp_nistz256_neg -################################################################################ + @@ -135,7 +135,7 @@ ecp_nistz256_ord_mul_mont: leaq .Lord(%rip),%r14 movq .LordK(%rip),%r15 -################################ + movq %rax,%rcx mulq 0(%rsi) movq %rax,%r8 @@ -163,7 +163,7 @@ ecp_nistz256_ord_mul_mont: adcq $0,%rdx movq %rdx,%r12 -################################ + mulq 0(%r14) movq %r8,%rbp addq %rax,%r13 @@ -193,7 +193,7 @@ ecp_nistz256_ord_mul_mont: adcq %rbp,%r12 adcq $0,%r13 -################################ + movq %rax,%rcx mulq 0(%rsi) addq %rax,%r9 @@ -229,7 +229,7 @@ ecp_nistz256_ord_mul_mont: adcq %rdx,%r13 adcq $0,%r8 -################################ + mulq 0(%r14) movq %r9,%rbp addq %rax,%rcx @@ -258,7 +258,7 @@ ecp_nistz256_ord_mul_mont: adcq %rbp,%r13 adcq $0,%r8 -################################# + movq %rax,%rcx mulq 0(%rsi) addq %rax,%r10 @@ -294,7 +294,7 @@ 
ecp_nistz256_ord_mul_mont: adcq %rdx,%r8 adcq $0,%r9 -################################ + mulq 0(%r14) movq %r10,%rbp addq %rax,%rcx @@ -323,7 +323,7 @@ ecp_nistz256_ord_mul_mont: adcq %rbp,%r8 adcq $0,%r9 -################################ + movq %rax,%rcx mulq 0(%rsi) addq %rax,%r11 @@ -359,7 +359,7 @@ ecp_nistz256_ord_mul_mont: adcq %rdx,%r9 adcq $0,%r10 -################################ + mulq 0(%r14) movq %r11,%rbp addq %rax,%rcx @@ -387,7 +387,7 @@ ecp_nistz256_ord_mul_mont: adcq %rbp,%r9 adcq $0,%r10 -################################ + movq %r12,%rsi subq 0(%r14),%r12 movq %r13,%r11 @@ -427,7 +427,7 @@ ecp_nistz256_ord_mul_mont: .cfi_endproc .size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont -################################################################################ + @@ -474,7 +474,7 @@ ecp_nistz256_ord_sqr_mont: .align 32 .Loop_ord_sqr: -################################ + movq %rax,%rbp mulq %r8 movq %rax,%r9 @@ -496,13 +496,13 @@ ecp_nistz256_ord_sqr_mont: adcq $0,%rdx movq %rdx,%r12 -################################ + mulq %r14 movq %rax,%r13 movq %r14,%rax movq %rdx,%r14 -################################ + mulq %rbp addq %rax,%r11 movq %r15,%rax @@ -517,7 +517,7 @@ ecp_nistz256_ord_sqr_mont: adcq %rdx,%r13 adcq $0,%r14 -################################ + xorq %r15,%r15 movq %r8,%rax addq %r9,%r9 @@ -528,7 +528,7 @@ ecp_nistz256_ord_sqr_mont: adcq %r14,%r14 adcq $0,%r15 -################################ + mulq %rax movq %rax,%r8 .byte 102,72,15,126,200 @@ -557,7 +557,7 @@ ecp_nistz256_ord_sqr_mont: movq 0(%rsi),%rax adcq %rdx,%r15 -################################ + mulq %r8 movq %r8,%rbp addq %rax,%rcx @@ -588,7 +588,7 @@ ecp_nistz256_ord_sqr_mont: addq %rbp,%r11 adcq $0,%r8 -################################ + mulq %r9 movq %r9,%rbp addq %rax,%rcx @@ -619,7 +619,7 @@ ecp_nistz256_ord_sqr_mont: addq %rbp,%r8 adcq $0,%r9 -################################ + mulq %r10 movq %r10,%rbp addq %rax,%rcx @@ -650,7 +650,7 @@ ecp_nistz256_ord_sqr_mont: addq %rbp,%r9 adcq $0,%r10 -################################ + mulq %r11 movq %r11,%rbp addq %rax,%rcx @@ -677,7 +677,7 @@ ecp_nistz256_ord_sqr_mont: addq %rbp,%r10 adcq $0,%r11 -################################ + xorq %rdx,%rdx addq %r12,%r8 adcq %r13,%r9 @@ -687,7 +687,7 @@ ecp_nistz256_ord_sqr_mont: movq %r9,%rax adcq $0,%rdx -################################ + subq 0(%rsi),%r8 movq %r10,%r14 sbbq 8(%rsi),%r9 @@ -730,7 +730,7 @@ ecp_nistz256_ord_sqr_mont: .byte 0xf3,0xc3 .cfi_endproc .size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont -################################################################################ + .type ecp_nistz256_ord_mul_montx,@function .align 32 ecp_nistz256_ord_mul_montx: @@ -766,7 +766,7 @@ ecp_nistz256_ord_mul_montx: leaq .Lord-128(%rip),%r14 movq .LordK(%rip),%r15 -################################ + mulxq %r9,%r8,%r9 mulxq %r10,%rcx,%r10 mulxq %r11,%rbp,%r11 @@ -778,7 +778,7 @@ ecp_nistz256_ord_mul_montx: adcq %rcx,%r11 adcq $0,%r12 -################################ + xorq %r13,%r13 mulxq 0+128(%r14),%rcx,%rbp adcxq %rcx,%r8 @@ -800,7 +800,7 @@ ecp_nistz256_ord_mul_montx: adoxq %r8,%r13 adcq $0,%r13 -################################ + mulxq 0+128(%rsi),%rcx,%rbp adcxq %rcx,%r9 adoxq %rbp,%r10 @@ -823,7 +823,7 @@ ecp_nistz256_ord_mul_montx: adoxq %r8,%r8 adcq $0,%r8 -################################ + mulxq 0+128(%r14),%rcx,%rbp adcxq %rcx,%r9 adoxq %rbp,%r10 @@ -844,7 +844,7 @@ ecp_nistz256_ord_mul_montx: adoxq %r9,%r8 adcq $0,%r8 -################################ + mulxq 
0+128(%rsi),%rcx,%rbp adcxq %rcx,%r10 adoxq %rbp,%r11 @@ -867,7 +867,7 @@ ecp_nistz256_ord_mul_montx: adoxq %r9,%r9 adcq $0,%r9 -################################ + mulxq 0+128(%r14),%rcx,%rbp adcxq %rcx,%r10 adoxq %rbp,%r11 @@ -888,7 +888,7 @@ ecp_nistz256_ord_mul_montx: adoxq %r10,%r9 adcq $0,%r9 -################################ + mulxq 0+128(%rsi),%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 @@ -911,7 +911,7 @@ ecp_nistz256_ord_mul_montx: adoxq %r10,%r10 adcq $0,%r10 -################################ + mulxq 0+128(%r14),%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 @@ -934,7 +934,7 @@ ecp_nistz256_ord_mul_montx: adoxq %r11,%r10 adcq $0,%r10 -################################# + movq %r8,%rcx subq 0(%r14),%r12 @@ -1019,7 +1019,7 @@ ecp_nistz256_ord_sqr_montx: adcq %rbp,%r11 adcq $0,%r12 xorq %r13,%r13 -################################# + mulxq %r15,%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 @@ -1029,7 +1029,7 @@ ecp_nistz256_ord_sqr_montx: adcxq %rcx,%r12 adoxq %rbp,%r13 adcq $0,%r13 -################################# + mulxq %r8,%rcx,%r14 movq %rax,%rdx .byte 102,73,15,110,216 @@ -1039,7 +1039,7 @@ ecp_nistz256_ord_sqr_montx: adcxq %r10,%r10 adoxq %r15,%r14 -################################ + mulxq %rdx,%r8,%rbp .byte 102,72,15,126,202 adcxq %r11,%r11 @@ -1061,7 +1061,7 @@ ecp_nistz256_ord_sqr_montx: adoxq %rcx,%r14 adoxq %rax,%r15 -################################ + movq %r8,%rdx mulxq 32(%rsi),%rdx,%rcx @@ -1080,7 +1080,7 @@ ecp_nistz256_ord_sqr_montx: adoxq %rbp,%r8 adcxq %rax,%r8 -################################# + movq %r9,%rdx mulxq 32(%rsi),%rdx,%rcx @@ -1098,7 +1098,7 @@ ecp_nistz256_ord_sqr_montx: adcxq %rbp,%r9 adoxq %rax,%r9 -################################# + movq %r10,%rdx mulxq 32(%rsi),%rdx,%rcx @@ -1116,7 +1116,7 @@ ecp_nistz256_ord_sqr_montx: adoxq %rbp,%r10 adcxq %rax,%r10 -################################# + movq %r11,%rdx mulxq 32(%rsi),%rdx,%rcx @@ -1134,7 +1134,7 @@ ecp_nistz256_ord_sqr_montx: adcxq %rbp,%r11 adoxq %rax,%r11 -################################ + addq %r8,%r12 adcq %r13,%r9 movq %r12,%rdx @@ -1143,7 +1143,7 @@ ecp_nistz256_ord_sqr_montx: movq %r9,%r14 adcq $0,%rax -################################ + subq 0(%rsi),%r12 movq %r10,%r15 sbbq 8(%rsi),%r9 @@ -1186,7 +1186,7 @@ ecp_nistz256_ord_sqr_montx: .byte 0xf3,0xc3 .cfi_endproc .size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx -################################################################################ + @@ -1268,7 +1268,7 @@ ecp_nistz256_mul_mont: .align 32 __ecp_nistz256_mul_montq: .cfi_startproc -######################################################################## + movq %rax,%rbp mulq %r9 @@ -1297,12 +1297,12 @@ __ecp_nistz256_mul_montq: xorq %r13,%r13 movq %rdx,%r12 -######################################################################## -# + + @@ -1318,7 +1318,7 @@ __ecp_nistz256_mul_montq: adcq $0,%r13 xorq %r8,%r8 -######################################################################## + movq %rax,%rbp mulq 0(%rsi) @@ -1351,7 +1351,7 @@ __ecp_nistz256_mul_montq: adcq %rdx,%r13 adcq $0,%r8 -######################################################################## + movq %r9,%rbp shlq $32,%r9 @@ -1365,7 +1365,7 @@ __ecp_nistz256_mul_montq: adcq $0,%r8 xorq %r9,%r9 -######################################################################## + movq %rax,%rbp mulq 0(%rsi) @@ -1398,7 +1398,7 @@ __ecp_nistz256_mul_montq: adcq %rdx,%r8 adcq $0,%r9 -######################################################################## + movq %r10,%rbp shlq $32,%r10 @@ -1412,7 +1412,7 @@ 
__ecp_nistz256_mul_montq: adcq $0,%r9 xorq %r10,%r10 -######################################################################## + movq %rax,%rbp mulq 0(%rsi) @@ -1445,7 +1445,7 @@ __ecp_nistz256_mul_montq: adcq %rdx,%r9 adcq $0,%r10 -######################################################################## + movq %r11,%rbp shlq $32,%r11 @@ -1459,7 +1459,7 @@ __ecp_nistz256_mul_montq: movq %r13,%rbp adcq $0,%r10 -######################################################################## + subq $-1,%r12 movq %r8,%rbx @@ -1482,7 +1482,7 @@ __ecp_nistz256_mul_montq: .cfi_endproc .size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq -################################################################################ + @@ -1578,7 +1578,7 @@ __ecp_nistz256_sqr_montq: adcq $0,%rdx movq %rdx,%r12 -################################# + mulq %r14 addq %rax,%r11 movq %r8,%rax @@ -1593,7 +1593,7 @@ __ecp_nistz256_sqr_montq: movq %rdx,%r13 adcq $0,%r13 -################################# + mulq %r15 xorq %r15,%r15 addq %rax,%r13 @@ -1637,7 +1637,7 @@ __ecp_nistz256_sqr_montq: movq .Lpoly+8(%rip),%rsi movq .Lpoly+24(%rip),%rbp -########################################## + movq %r8,%rcx @@ -1650,7 +1650,7 @@ __ecp_nistz256_sqr_montq: movq %r9,%rax adcq $0,%rdx -########################################## + movq %r9,%rcx shlq $32,%r9 @@ -1663,7 +1663,7 @@ __ecp_nistz256_sqr_montq: movq %r10,%rax adcq $0,%rdx -########################################## + movq %r10,%rcx shlq $32,%r10 @@ -1676,7 +1676,7 @@ __ecp_nistz256_sqr_montq: movq %r11,%rax adcq $0,%rdx -########################################### + movq %r11,%rcx shlq $32,%r11 @@ -1689,7 +1689,7 @@ __ecp_nistz256_sqr_montq: adcq $0,%rdx xorq %r11,%r11 -############################################ + addq %r8,%r12 adcq %r9,%r13 @@ -1723,7 +1723,7 @@ __ecp_nistz256_sqr_montq: .align 32 __ecp_nistz256_mul_montx: .cfi_startproc -######################################################################## + mulxq %r9,%r8,%r9 mulxq %r10,%rcx,%r10 @@ -1740,7 +1740,7 @@ __ecp_nistz256_mul_montx: shrxq %r14,%r8,%rcx adcq $0,%r12 -######################################################################## + addq %rbp,%r9 adcq %rcx,%r10 @@ -1752,7 +1752,7 @@ __ecp_nistz256_mul_montx: adcq $0,%r13 xorq %r8,%r8 -######################################################################## + mulxq 0+128(%rsi),%rcx,%rbp adcxq %rcx,%r9 @@ -1777,7 +1777,7 @@ __ecp_nistz256_mul_montx: adoxq %r8,%r8 adcq $0,%r8 -######################################################################## + addq %rcx,%r10 adcq %rbp,%r11 @@ -1789,7 +1789,7 @@ __ecp_nistz256_mul_montx: adcq $0,%r8 xorq %r9,%r9 -######################################################################## + mulxq 0+128(%rsi),%rcx,%rbp adcxq %rcx,%r10 @@ -1814,7 +1814,7 @@ __ecp_nistz256_mul_montx: adoxq %r9,%r9 adcq $0,%r9 -######################################################################## + addq %rcx,%r11 adcq %rbp,%r12 @@ -1826,7 +1826,7 @@ __ecp_nistz256_mul_montx: adcq $0,%r9 xorq %r10,%r10 -######################################################################## + mulxq 0+128(%rsi),%rcx,%rbp adcxq %rcx,%r11 @@ -1851,7 +1851,7 @@ __ecp_nistz256_mul_montx: adoxq %r10,%r10 adcq $0,%r10 -######################################################################## + addq %rcx,%r12 adcq %rbp,%r13 @@ -1864,7 +1864,7 @@ __ecp_nistz256_mul_montx: adcq %rbp,%r9 adcq $0,%r10 -######################################################################## + xorl %eax,%eax movq %r8,%rcx @@ -1902,7 +1902,7 @@ __ecp_nistz256_sqr_montx: adcq $0,%r12 
xorq %r13,%r13 -################################# + mulxq %r15,%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 @@ -1913,7 +1913,7 @@ __ecp_nistz256_sqr_montx: adoxq %rbp,%r13 adcq $0,%r13 -################################# + mulxq %r8,%rcx,%r14 movq 0+128(%rsi),%rdx xorq %r15,%r15 @@ -2017,7 +2017,7 @@ __ecp_nistz256_sqr_montx: .byte 0xf3,0xc3 .cfi_endproc .size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx -################################################################################ + .globl ecp_nistz256_select_w5 .hidden ecp_nistz256_select_w5 @@ -2084,7 +2084,7 @@ ecp_nistz256_select_w5: .LSEH_end_ecp_nistz256_select_w5: .size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 -################################################################################ + .globl ecp_nistz256_select_w7 .hidden ecp_nistz256_select_w7 @@ -2139,7 +2139,7 @@ ecp_nistz256_select_w7: .cfi_endproc .LSEH_end_ecp_nistz256_select_w7: .size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 -################################################################################ + .type ecp_nistz256_avx2_select_w5,@function .align 32 @@ -2203,7 +2203,7 @@ ecp_nistz256_avx2_select_w5: .LSEH_end_ecp_nistz256_avx2_select_w5: .size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5 -################################################################################ + .globl ecp_nistz256_avx2_select_w7 .hidden ecp_nistz256_avx2_select_w7 @@ -2908,9 +2908,9 @@ ecp_nistz256_point_add: movq 24+32(%rsp),%r12 leaq 192(%rsp),%rdi call __ecp_nistz256_mul_montq -#lea 192(%rsp), %rsi -#lea 32(%rsp), %rdi -#call __ecp_nistz256_mul_by_2 + + + xorq %r11,%r11 addq %r12,%r12 @@ -3153,7 +3153,7 @@ ecp_nistz256_point_add_affine: pcmpeqd %xmm4,%xmm5 pshufd $0xb1,%xmm3,%xmm4 movq 0(%rbx),%rax -#lea 0x00(%rbx), %rbx + movq %r12,%r9 por %xmm3,%xmm4 pshufd $0,%xmm5,%xmm5 @@ -3243,9 +3243,9 @@ ecp_nistz256_point_add_affine: movq 24+128(%rsp),%r12 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_montq -#lea 0(%rsp), %rsi -#lea 128(%rsp), %rdi -#call __ecp_nistz256_mul_by_2 + + + xorq %r11,%r11 addq %r12,%r12 @@ -4029,9 +4029,9 @@ ecp_nistz256_point_addx: movq 24+32(%rsp),%r12 leaq 192(%rsp),%rdi call __ecp_nistz256_mul_montx -#lea 192(%rsp), %rsi -#lea 32(%rsp), %rdi -#call __ecp_nistz256_mul_by_2 + + + xorq %r11,%r11 addq %r12,%r12 @@ -4268,7 +4268,7 @@ ecp_nistz256_point_add_affinex: pcmpeqd %xmm4,%xmm5 pshufd $0xb1,%xmm3,%xmm4 movq 0(%rbx),%rdx -#lea 0x00(%rbx), %rbx + movq %r12,%r9 por %xmm3,%xmm4 pshufd $0,%xmm5,%xmm5 @@ -4358,9 +4358,9 @@ ecp_nistz256_point_add_affinex: movq 24+128(%rsp),%r12 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_montx -#lea 0(%rsp), %rsi -#lea 128(%rsp), %rdi -#call __ecp_nistz256_mul_by_2 + + + xorq %r11,%r11 addq %r12,%r12 diff --git a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S index d7b0cb4b987a0a..fefccd6fdf2849 100644 --- a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S +++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S @@ -23,14 +23,13 @@ CRYPTO_rdrand: .cfi_startproc xorq %rax,%rax - - -.byte 0x48, 0x0f, 0xc7, 0xf1 +.byte 72,15,199,242 adcq %rax,%rax - movq %rcx,0(%rdi) + movq %rdx,0(%rdi) .byte 0xf3,0xc3 .cfi_endproc +.size CRYPTO_rdrand,.-CRYPTO_rdrand @@ -46,9 +45,7 @@ CRYPTO_rdrand_multiple8_buf: jz .Lout movq $8,%rdx .Lloop: - - -.byte 0x48, 0x0f, 0xc7, 0xf1 +.byte 72,15,199,241 jnc .Lerr movq %rcx,0(%rdi) addq %rdx,%rdi @@ -61,4 +58,5 @@ CRYPTO_rdrand_multiple8_buf: xorq 
%rax,%rax .byte 0xf3,0xc3 .cfi_endproc +.size CRYPTO_rdrand_multiple8_buf,.-CRYPTO_rdrand_multiple8_buf #endif diff --git a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rsaz-avx2.S b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rsaz-avx2.S index bfdc965f3d36dc..579c705556506a 100644 --- a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rsaz-avx2.S +++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rsaz-avx2.S @@ -382,9 +382,9 @@ rsaz_1024_sqr_avx2: vpaddq %ymm10,%ymm7,%ymm7 vpmuludq 256-128(%r13),%ymm12,%ymm14 vmovd %eax,%xmm12 -#vmovdqu 32*1-8-128(%r13), %ymm11 + vpaddq %ymm14,%ymm8,%ymm8 -#vmovdqu 32*2-8-128(%r13), %ymm10 + vpbroadcastq %xmm12,%ymm12 vpmuludq 32-8-128(%r13),%ymm13,%ymm11 @@ -460,7 +460,7 @@ rsaz_1024_sqr_avx2: addq %r12,%rax vpaddq %ymm14,%ymm7,%ymm7 vpmuludq %ymm12,%ymm11,%ymm11 -#vmovdqu 32*2-24-128(%r13), %ymm14 + movq %rax,%r9 imull %ecx,%eax vpaddq %ymm11,%ymm8,%ymm8 diff --git a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S index 0ce9566942a681..55b540f161c2a5 100644 --- a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S +++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S @@ -1819,8 +1819,8 @@ sha256_block_data_order_ssse3: movl 20(%rdi),%r9d movl 24(%rdi),%r10d movl 28(%rdi),%r11d -#movdqa K256+512+32(%rip),%xmm8 -#movdqa K256+512+64(%rip),%xmm9 + + jmp .Lloop_ssse3 .align 16 .Lloop_ssse3: diff --git a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S index 8cdda6c9a6c15d..232ddf99f411d1 100644 --- a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S +++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S @@ -13,21 +13,21 @@ #endif .text -## -# -## -# -## -# -# -# -# -## -# -# -# -## -## + + + + + + + + + + + + + + + .type _vpaes_encrypt_core,@function .align 16 _vpaes_encrypt_core: @@ -115,11 +115,11 @@ _vpaes_encrypt_core: .cfi_endproc .size _vpaes_encrypt_core,.-_vpaes_encrypt_core -## -# -## -# -## + + + + + .type _vpaes_decrypt_core,@function .align 16 _vpaes_decrypt_core: @@ -149,9 +149,9 @@ _vpaes_decrypt_core: .align 16 .Ldec_loop: -## -# -## + + + movdqa -32(%r10),%xmm4 movdqa -16(%r10),%xmm1 .byte 102,15,56,0,226 @@ -223,11 +223,11 @@ _vpaes_decrypt_core: .cfi_endproc .size _vpaes_decrypt_core,.-_vpaes_decrypt_core -######################################################## -# -# -# -######################################################## + + + + + .type _vpaes_schedule_core,@function .align 16 _vpaes_schedule_core: @@ -268,14 +268,14 @@ _vpaes_schedule_core: je .Lschedule_192 -## -# -## -# -## -# -# -## + + + + + + + + .Lschedule_128: movl $10,%esi @@ -286,21 +286,21 @@ _vpaes_schedule_core: call _vpaes_schedule_mangle jmp .Loop_schedule_128 -## -# -## -# -## -# -# -# -# -## -# -# -# -# -## + + + + + + + + + + + + + + + .align 16 .Lschedule_192: movdqu 8(%rdi),%xmm0 @@ -323,16 +323,16 @@ _vpaes_schedule_core: call _vpaes_schedule_192_smear jmp .Loop_schedule_192 -## -# -## -# -## -# -# -# -# -## + + + + + + + + + + .align 16 .Lschedule_256: movdqu 16(%rdi),%xmm0 @@ -359,16 +359,16 @@ _vpaes_schedule_core: jmp .Loop_schedule_256 -## -# -## -# -# -# -# -## -# -## + + + + + + + + + + .align 16 .Lschedule_mangle_last: @@ -401,20 +401,20 @@ _vpaes_schedule_core: .cfi_endproc .size _vpaes_schedule_core,.-_vpaes_schedule_core -## -# -## -# -## -# -# -# -# -## -# -# -# -## + + + 
+ + + + + + + + + + + .type _vpaes_schedule_192_smear,@function .align 16 _vpaes_schedule_192_smear: @@ -430,24 +430,24 @@ _vpaes_schedule_192_smear: .cfi_endproc .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear -## -# -## -# -## -# -# -# -## -# -# -## -# -# -## -# -# -## + + + + + + + + + + + + + + + + + + .type _vpaes_schedule_round,@function .align 16 _vpaes_schedule_round: @@ -508,15 +508,15 @@ _vpaes_schedule_low_round: .cfi_endproc .size _vpaes_schedule_round,.-_vpaes_schedule_round -## -# -## -# -## -# -# -# -## + + + + + + + + + .type _vpaes_schedule_transform,@function .align 16 _vpaes_schedule_transform: @@ -534,29 +534,29 @@ _vpaes_schedule_transform: .cfi_endproc .size _vpaes_schedule_transform,.-_vpaes_schedule_transform -## -# -## -# -# -## -# -# -# -# -## -# -# -# -# -# -## -## -# -# -# -# -## + + + + + + + + + + + + + + + + + + + + + + + .type _vpaes_schedule_mangle,@function .align 16 _vpaes_schedule_mangle: @@ -628,9 +628,9 @@ _vpaes_schedule_mangle: .cfi_endproc .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle -# -# + + .globl vpaes_set_encrypt_key .hidden vpaes_set_encrypt_key .type vpaes_set_encrypt_key,@function @@ -744,12 +744,12 @@ vpaes_cbc_encrypt: .byte 0xf3,0xc3 .cfi_endproc .size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt -## -# -## -# -# -## + + + + + + .type _vpaes_preheat,@function .align 16 _vpaes_preheat: @@ -765,11 +765,11 @@ _vpaes_preheat: .byte 0xf3,0xc3 .cfi_endproc .size _vpaes_preheat,.-_vpaes_preheat -######################################################## -# -# -# -######################################################## + + + + + .type _vpaes_consts,@object .align 64 _vpaes_consts: @@ -826,10 +826,10 @@ _vpaes_consts: .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 -## -# -# -## + + + + .Lk_dksd: .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E @@ -843,10 +843,10 @@ _vpaes_consts: .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE -## -# -# -## + + + + .Lk_dipt: .quad 0x0F505B040B545F00, 0x154A411E114E451A .quad 0x86E383E660056500, 0x12771772F491F194 diff --git a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont.S b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont.S index 95d5e1db2db2ae..f3637f01aa8cc6 100644 --- a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont.S +++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont.S @@ -59,7 +59,7 @@ bn_mul_mont: andq $-1024,%r10 -# + @@ -736,11 +736,11 @@ bn_sqr8x_mont: shlq $3+2,%r10 negq %r9 -############################################################## -# + + leaq -64(%rsp,%r9,2),%r11 movq %rsp,%rbp movq (%r8),%r8 @@ -943,7 +943,6 @@ bn_mulx4x_mont: .Lmulx4x_page_walk_done: leaq (%rdx,%r9,1),%r10 -############################################################## @@ -954,7 +953,8 @@ bn_mulx4x_mont: -# + + movq %r9,0(%rsp) shrq $5,%r9 movq %r10,16(%rsp) diff --git a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont5.S b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont5.S index 71d5e9febb8c99..b12393e201071d 100644 --- a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont5.S +++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont5.S @@ -54,7 +54,7 @@ bn_mul_mont_gather5: andq $-1024,%r10 -# + @@ -487,7 +487,6 @@ bn_mul4x_mont_gather5: leaq (%r9,%r9,2),%r10 negq %r9 -############################################################## @@ -495,7 +494,8 @@ 
bn_mul4x_mont_gather5: -# + + leaq -320(%rsp,%r9,2),%r11 movq %rsp,%rbp subq %rdi,%r11 @@ -566,6 +566,7 @@ bn_mul4x_mont_gather5: .type mul4x_internal,@function .align 32 mul4x_internal: +.cfi_startproc shlq $5,%r9 movd 8(%rax),%xmm5 leaq .Linc(%rip),%rax @@ -1087,6 +1088,7 @@ mul4x_internal: movq 16(%rbp),%r14 movq 24(%rbp),%r15 jmp .Lsqr4x_sub_entry +.cfi_endproc .size mul4x_internal,.-mul4x_internal .globl bn_power5 .hidden bn_power5 @@ -1120,13 +1122,13 @@ bn_power5: negq %r9 movq (%r8),%r8 -############################################################## -# + + leaq -320(%rsp,%r9,2),%r11 movq %rsp,%rbp subq %rdi,%r11 @@ -1166,15 +1168,15 @@ bn_power5: movq %r9,%r10 negq %r9 -############################################################## -# -# + + + movq %r8,32(%rsp) movq %rax,40(%rsp) .cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 @@ -1232,14 +1234,15 @@ bn_power5: .align 32 bn_sqr8x_internal: __bn_sqr8x_internal: -############################################################## +.cfi_startproc + + + + -# -# -############################################################## @@ -2006,10 +2009,12 @@ __bn_sqr8x_reduction: cmpq %rdx,%rdi jb .L8x_reduction_loop .byte 0xf3,0xc3 +.cfi_endproc .size bn_sqr8x_internal,.-bn_sqr8x_internal .type __bn_post4x_internal,@function .align 32 __bn_post4x_internal: +.cfi_startproc movq 0(%rbp),%r12 leaq (%rdi,%r9,1),%rbx movq %r9,%rcx @@ -2060,16 +2065,19 @@ __bn_post4x_internal: movq %r9,%r10 negq %r9 .byte 0xf3,0xc3 +.cfi_endproc .size __bn_post4x_internal,.-__bn_post4x_internal .globl bn_from_montgomery .hidden bn_from_montgomery .type bn_from_montgomery,@function .align 32 bn_from_montgomery: +.cfi_startproc testl $7,%r9d jz bn_from_mont8x xorl %eax,%eax .byte 0xf3,0xc3 +.cfi_endproc .size bn_from_montgomery,.-bn_from_montgomery .type bn_from_mont8x,@function @@ -2098,13 +2106,13 @@ bn_from_mont8x: negq %r9 movq (%r8),%r8 -############################################################## -# + + leaq -320(%rsp,%r9,2),%r11 movq %rsp,%rbp subq %rdi,%r11 @@ -2144,15 +2152,15 @@ bn_from_mont8x: movq %r9,%r10 negq %r9 -############################################################## -# -# + + + movq %r8,32(%rsp) movq %rax,40(%rsp) .cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 @@ -2266,7 +2274,6 @@ bn_mulx4x_mont_gather5: negq %r9 movq (%r8),%r8 -############################################################## @@ -2274,7 +2281,8 @@ bn_mulx4x_mont_gather5: -# + + leaq -320(%rsp,%r9,2),%r11 movq %rsp,%rbp subq %rdi,%r11 @@ -2310,7 +2318,6 @@ bn_mulx4x_mont_gather5: ja .Lmulx4x_page_walk .Lmulx4x_page_walk_done: -############################################################## @@ -2321,7 +2328,8 @@ bn_mulx4x_mont_gather5: -# + + movq %r8,32(%rsp) movq %rax,40(%rsp) .cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 @@ -2354,6 +2362,7 @@ bn_mulx4x_mont_gather5: .type mulx4x_internal,@function .align 32 mulx4x_internal: +.cfi_startproc movq %r9,8(%rsp) movq %r9,%r10 negq %r9 @@ -2772,6 +2781,7 @@ mulx4x_internal: movq 16(%rbp),%r14 movq 24(%rbp),%r15 jmp .Lsqrx4x_sub_entry +.cfi_endproc .size mulx4x_internal,.-mulx4x_internal .type bn_powerx5,@function .align 32 @@ -2799,13 +2809,13 @@ bn_powerx5: negq %r9 movq (%r8),%r8 -############################################################## -# + + leaq -320(%rsp,%r9,2),%r11 movq %rsp,%rbp subq %rdi,%r11 @@ -2845,9 +2855,7 @@ bn_powerx5: movq %r9,%r10 negq %r9 -############################################################## -# @@ -2855,7 +2863,9 @@ bn_powerx5: -# + + + pxor %xmm0,%xmm0 .byte 102,72,15,110,207 .byte 102,72,15,110,209 @@ 
-2916,14 +2926,10 @@ bn_powerx5: bn_sqrx8x_internal: __bn_sqrx8x_internal: .cfi_startproc -################################################################## -# -# -################################################################## @@ -2931,7 +2937,6 @@ __bn_sqrx8x_internal: -# @@ -2948,7 +2953,12 @@ __bn_sqrx8x_internal: -# + + + + + + @@ -2980,7 +2990,7 @@ __bn_sqrx8x_internal: jnz .Lsqrx8x_zero movq 0(%rsi),%rdx -#xor %r9,%r9 + xorq %r10,%r10 xorq %r11,%r11 xorq %r12,%r12 @@ -3097,7 +3107,7 @@ __bn_sqrx8x_internal: movq %r14,%rdx adoxq %rbx,%r11 adcxq %r12,%r11 -#adox %rbp,%rax + adcxq %rbp,%rax mulxq %r15,%r14,%rbx @@ -3136,7 +3146,7 @@ __bn_sqrx8x_internal: movq %rax,16+8(%rsp) movq %rdi,24+8(%rsp) -#lea 8*8(%rdi),%rdi + xorl %eax,%eax jmp .Lsqrx8x_loop @@ -3259,7 +3269,7 @@ __bn_sqrx8x_internal: adoxq %r11,%r11 movq 16(%rdi),%r12 movq 24(%rdi),%r13 -#jmp .Lsqrx4x_shift_n_add + .align 32 .Lsqrx4x_shift_n_add: @@ -3324,7 +3334,7 @@ __bn_sqrx8x_reduction: movq 32+8(%rsp),%rbx movq 48+8(%rsp),%rdx leaq -64(%rbp,%r9,1),%rcx -#lea 48+8(%rsp,%r9,2),%rdi + movq %rcx,0+8(%rsp) movq %rdi,8+8(%rsp) @@ -3530,13 +3540,15 @@ __bn_sqrx8x_reduction: .cfi_endproc .size bn_sqrx8x_internal,.-bn_sqrx8x_internal .align 32 +.type __bn_postx4x_internal,@function __bn_postx4x_internal: +.cfi_startproc movq 0(%rbp),%r12 movq %rcx,%r10 movq %rcx,%r9 negq %rax sarq $3+2,%rcx -#lea 48+8(%rsp,%r9),%rdi + .byte 102,72,15,126,202 .byte 102,72,15,126,206 decq %r12 @@ -3578,12 +3590,14 @@ __bn_postx4x_internal: negq %r9 .byte 0xf3,0xc3 +.cfi_endproc .size __bn_postx4x_internal,.-__bn_postx4x_internal .globl bn_scatter5 .hidden bn_scatter5 .type bn_scatter5,@function .align 16 bn_scatter5: +.cfi_startproc cmpl $0,%esi jz .Lscatter_epilogue leaq (%rdx,%rcx,8),%rdx @@ -3596,6 +3610,7 @@ bn_scatter5: jnz .Lscatter .Lscatter_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bn_scatter5,.-bn_scatter5 .globl bn_gather5 @@ -3603,10 +3618,12 @@ bn_scatter5: .type bn_gather5,@function .align 32 bn_gather5: +.cfi_startproc .LSEH_begin_bn_gather5: -.byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10 -.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub src/crypto/fipsmodule/bn/asm/x86_64-mont5.plx108,%rsp +.byte 0x4c,0x8d,0x14,0x24 +.cfi_def_cfa_register %r10 +.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 leaq .Linc(%rip),%rax andq $-16,%rsp @@ -3759,8 +3776,10 @@ bn_gather5: jnz .Lgather leaq (%r10),%rsp +.cfi_def_cfa_register %rsp .byte 0xf3,0xc3 .LSEH_end_bn_gather5: +.cfi_endproc .size bn_gather5,.-bn_gather5 .align 64 .Linc: diff --git a/third_party/boringssl/mac-x86_64/crypto/chacha/chacha-x86_64.S b/third_party/boringssl/mac-x86_64/crypto/chacha/chacha-x86_64.S index 0bbbf0a4894dcd..10b1ad95205879 100644 --- a/third_party/boringssl/mac-x86_64/crypto/chacha/chacha-x86_64.S +++ b/third_party/boringssl/mac-x86_64/crypto/chacha/chacha-x86_64.S @@ -72,13 +72,13 @@ _ChaCha20_ctr32: L$ctr32_body: -#movdqa .Lsigma(%rip),%xmm0 + movdqu (%rcx),%xmm1 movdqu 16(%rcx),%xmm2 movdqu (%r8),%xmm3 movdqa L$one(%rip),%xmm4 -#movdqa %xmm0,4*0(%rsp) + movdqa %xmm1,16(%rsp) movdqa %xmm2,32(%rsp) movdqa %xmm3,48(%rsp) @@ -871,9 +871,9 @@ L$tail4x: cmpq $64,%rdx jae L$64_or_more4x -#movdqa 0x00(%rsp),%xmm6 + xorq %r10,%r10 -#movdqa %xmm6,0x00(%rsp) + movdqa %xmm12,16(%rsp) movdqa %xmm4,32(%rsp) movdqa %xmm0,48(%rsp) @@ -1027,7 +1027,7 @@ L$ChaCha20_8x: andq $-32,%rsp vzeroupper -############### + diff --git a/third_party/boringssl/mac-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S b/third_party/boringssl/mac-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S index 
7aa437c7eb6395..0c921b37b5462d 100644 --- a/third_party/boringssl/mac-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S +++ b/third_party/boringssl/mac-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S @@ -213,7 +213,7 @@ L$htable_polyval_main_loop: vpclmulqdq $0x10,(%rdi),%xmm0,%xmm6 vpxor %xmm6,%xmm5,%xmm5 -######################################################### + vmovdqu 96(%rsi),%xmm0 vpclmulqdq $0x01,16(%rdi),%xmm0,%xmm6 vpxor %xmm6,%xmm5,%xmm5 @@ -225,7 +225,7 @@ L$htable_polyval_main_loop: vpxor %xmm6,%xmm5,%xmm5 -######################################################### + vmovdqu 80(%rsi),%xmm0 vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm7 @@ -242,7 +242,7 @@ L$htable_polyval_main_loop: vpxor %xmm7,%xmm1,%xmm1 -######################################################### + vmovdqu 64(%rsi),%xmm0 vpclmulqdq $0x01,48(%rdi),%xmm0,%xmm6 @@ -254,7 +254,7 @@ L$htable_polyval_main_loop: vpclmulqdq $0x10,48(%rdi),%xmm0,%xmm6 vpxor %xmm6,%xmm5,%xmm5 -######################################################### + vmovdqu 48(%rsi),%xmm0 vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm7 @@ -271,7 +271,7 @@ L$htable_polyval_main_loop: vpxor %xmm7,%xmm1,%xmm1 -######################################################### + vmovdqu 32(%rsi),%xmm0 vpclmulqdq $0x01,80(%rdi),%xmm0,%xmm6 @@ -285,7 +285,7 @@ L$htable_polyval_main_loop: vpxor %xmm9,%xmm1,%xmm1 -######################################################### + vmovdqu 16(%rsi),%xmm0 vpclmulqdq $0x01,96(%rdi),%xmm0,%xmm6 @@ -297,7 +297,7 @@ L$htable_polyval_main_loop: vpclmulqdq $0x10,96(%rdi),%xmm0,%xmm6 vpxor %xmm6,%xmm5,%xmm5 -######################################################### + vmovdqu 0(%rsi),%xmm0 vpxor %xmm1,%xmm0,%xmm0 @@ -310,7 +310,7 @@ L$htable_polyval_main_loop: vpclmulqdq $0x10,112(%rdi),%xmm0,%xmm6 vpxor %xmm6,%xmm5,%xmm5 -######################################################### + vpsrldq $8,%xmm5,%xmm6 vpslldq $8,%xmm5,%xmm5 @@ -320,7 +320,7 @@ L$htable_polyval_main_loop: leaq 128(%rsi),%rsi jmp L$htable_polyval_main_loop -######################################################### + L$htable_polyval_out: vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm6 @@ -488,10 +488,10 @@ _aes128gcmsiv_aes_ks_enc_x1: vmovdqa con1(%rip),%xmm0 vmovdqa mask(%rip),%xmm15 - vpshufb %xmm15,%xmm1,%xmm2#!!saving mov instruction to xmm2 + vpshufb %xmm15,%xmm1,%xmm2 vaesenclast %xmm0,%xmm2,%xmm2 vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3#!!saving mov instruction to xmm3 + vpsllq $32,%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 vpshufb con3(%rip),%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 @@ -500,10 +500,10 @@ _aes128gcmsiv_aes_ks_enc_x1: vaesenc %xmm1,%xmm4,%xmm4 vmovdqa %xmm1,16(%rdx) - vpshufb %xmm15,%xmm1,%xmm2#!!saving mov instruction to xmm2 + vpshufb %xmm15,%xmm1,%xmm2 vaesenclast %xmm0,%xmm2,%xmm2 vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3#!!saving mov instruction to xmm3 + vpsllq $32,%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 vpshufb con3(%rip),%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 @@ -512,10 +512,10 @@ _aes128gcmsiv_aes_ks_enc_x1: vaesenc %xmm1,%xmm4,%xmm4 vmovdqa %xmm1,32(%rdx) - vpshufb %xmm15,%xmm1,%xmm2#!!saving mov instruction to xmm2 + vpshufb %xmm15,%xmm1,%xmm2 vaesenclast %xmm0,%xmm2,%xmm2 vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3#!!saving mov instruction to xmm3 + vpsllq $32,%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 vpshufb con3(%rip),%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 @@ -524,10 +524,10 @@ _aes128gcmsiv_aes_ks_enc_x1: vaesenc %xmm1,%xmm4,%xmm4 vmovdqa %xmm1,48(%rdx) - vpshufb %xmm15,%xmm1,%xmm2#!!saving mov instruction to xmm2 + vpshufb %xmm15,%xmm1,%xmm2 vaesenclast 
%xmm0,%xmm2,%xmm2 vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3#!!saving mov instruction to xmm3 + vpsllq $32,%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 vpshufb con3(%rip),%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 @@ -536,10 +536,10 @@ _aes128gcmsiv_aes_ks_enc_x1: vaesenc %xmm1,%xmm4,%xmm4 vmovdqa %xmm1,64(%rdx) - vpshufb %xmm15,%xmm1,%xmm2#!!saving mov instruction to xmm2 + vpshufb %xmm15,%xmm1,%xmm2 vaesenclast %xmm0,%xmm2,%xmm2 vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3#!!saving mov instruction to xmm3 + vpsllq $32,%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 vpshufb con3(%rip),%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 @@ -548,10 +548,10 @@ _aes128gcmsiv_aes_ks_enc_x1: vaesenc %xmm1,%xmm4,%xmm4 vmovdqa %xmm1,80(%rdx) - vpshufb %xmm15,%xmm1,%xmm2#!!saving mov instruction to xmm2 + vpshufb %xmm15,%xmm1,%xmm2 vaesenclast %xmm0,%xmm2,%xmm2 vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3#!!saving mov instruction to xmm3 + vpsllq $32,%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 vpshufb con3(%rip),%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 @@ -560,10 +560,10 @@ _aes128gcmsiv_aes_ks_enc_x1: vaesenc %xmm1,%xmm4,%xmm4 vmovdqa %xmm1,96(%rdx) - vpshufb %xmm15,%xmm1,%xmm2#!!saving mov instruction to xmm2 + vpshufb %xmm15,%xmm1,%xmm2 vaesenclast %xmm0,%xmm2,%xmm2 vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3#!!saving mov instruction to xmm3 + vpsllq $32,%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 vpshufb con3(%rip),%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 @@ -572,10 +572,10 @@ _aes128gcmsiv_aes_ks_enc_x1: vaesenc %xmm1,%xmm4,%xmm4 vmovdqa %xmm1,112(%rdx) - vpshufb %xmm15,%xmm1,%xmm2#!!saving mov instruction to xmm2 + vpshufb %xmm15,%xmm1,%xmm2 vaesenclast %xmm0,%xmm2,%xmm2 vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3#!!saving mov instruction to xmm3 + vpsllq $32,%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 vpshufb con3(%rip),%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 @@ -587,10 +587,10 @@ _aes128gcmsiv_aes_ks_enc_x1: vmovdqa con2(%rip),%xmm0 - vpshufb %xmm15,%xmm1,%xmm2#!!saving mov instruction to xmm2 + vpshufb %xmm15,%xmm1,%xmm2 vaesenclast %xmm0,%xmm2,%xmm2 vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3#!!saving mov instruction to xmm3 + vpsllq $32,%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 vpshufb con3(%rip),%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 @@ -599,9 +599,9 @@ _aes128gcmsiv_aes_ks_enc_x1: vaesenc %xmm1,%xmm4,%xmm4 vmovdqa %xmm1,144(%rdx) - vpshufb %xmm15,%xmm1,%xmm2#!!saving mov instruction to xmm2 + vpshufb %xmm15,%xmm1,%xmm2 vaesenclast %xmm0,%xmm2,%xmm2 - vpsllq $32,%xmm1,%xmm3#!!saving mov instruction to xmm3 + vpsllq $32,%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 vpshufb con3(%rip),%xmm1,%xmm3 vpxor %xmm3,%xmm1,%xmm1 @@ -731,7 +731,7 @@ L$128_enc_msg_x4_start: vmovdqa (%rdx),%xmm15 - vpor OR_MASK(%rip),%xmm15,%xmm15#IV = [1]TAG[126...32][00..00] + vpor OR_MASK(%rip),%xmm15,%xmm15 vmovdqu four(%rip),%xmm4 vmovdqa %xmm15,%xmm0 diff --git a/third_party/boringssl/mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S b/third_party/boringssl/mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S index 42476a98d696ba..e50227ae38cb89 100644 --- a/third_party/boringssl/mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S +++ b/third_party/boringssl/mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S @@ -1342,7 +1342,7 @@ open_sse_main_loop: leaq 128(%rdi),%rdi jmp open_sse_tail_64_dec_loop 3: -############################################################################### + movdqa .chacha20_consts(%rip),%xmm0 movdqa 48(%rbp),%xmm4 movdqa 64(%rbp),%xmm8 @@ -1697,7 +1697,7 @@ open_sse_main_loop: subq $192,%rbx leaq 192(%rsi),%rsi leaq 192(%rdi),%rdi 
-############################################################################### + open_sse_tail_64_dec_loop: cmpq $16,%rbx @@ -1867,7 +1867,7 @@ open_sse_finalize: .byte 0xf3,0xc3 -############################################################################### + open_sse_128: movdqu .chacha20_consts(%rip),%xmm0 movdqa %xmm0,%xmm1 @@ -2101,8 +2101,8 @@ open_sse_128: -################################################################################ -################################################################################ + + .globl _chacha20_poly1305_seal .private_extern _chacha20_poly1305_seal @@ -2838,7 +2838,7 @@ _chacha20_poly1305_seal: movq $6,%rcx cmpq $64,%rbx jg 3f -############################################################################### + seal_sse_tail_64: movdqa .chacha20_consts(%rip),%xmm0 movdqa 48(%rbp),%xmm4 @@ -2990,7 +2990,7 @@ seal_sse_tail_64: 3: cmpq $128,%rbx jg 3f -############################################################################### + seal_sse_tail_128: movdqa .chacha20_consts(%rip),%xmm0 movdqa 48(%rbp),%xmm4 @@ -3207,7 +3207,7 @@ seal_sse_tail_128: leaq 64(%rsi),%rsi jmp seal_sse_128_seal_hash 3: -############################################################################### + seal_sse_tail_192: movdqa .chacha20_consts(%rip),%xmm0 movdqa 48(%rbp),%xmm4 @@ -3486,7 +3486,7 @@ seal_sse_tail_192: movq $128,%rcx subq $128,%rbx leaq 128(%rsi),%rsi -############################################################################### + seal_sse_128_seal_hash: cmpq $16,%rcx jb seal_sse_128_seal @@ -3631,7 +3631,7 @@ seal_sse_tail_16: -# + movq 288+32(%rsp),%r9 @@ -3935,7 +3935,7 @@ do_length_block: .byte 0xf3,0xc3 -################################################################################ + seal_sse_128: movdqu .chacha20_consts(%rip),%xmm0 movdqa %xmm0,%xmm1 @@ -4106,7 +4106,7 @@ seal_sse_128: jmp seal_sse_128_seal -############################################################################### + .p2align 6 chacha20_poly1305_open_avx2: @@ -5818,7 +5818,7 @@ open_avx2_tail: vmovdqa %xmm0,%xmm1 jb 1f subq $16,%rbx -#load for decryption + vpxor (%rsi),%xmm0,%xmm1 vmovdqu %xmm1,(%rdi) leaq 16(%rsi),%rsi @@ -5828,7 +5828,7 @@ open_avx2_tail: 1: vzeroupper jmp open_sse_tail_16 -############################################################################### + open_avx2_192: vmovdqa %ymm0,%ymm1 vmovdqa %ymm0,%ymm2 @@ -6100,7 +6100,7 @@ open_avx2_short_tail_32: 1: vzeroupper jmp open_sse_tail_16 -############################################################################### + open_avx2_320: vmovdqa %ymm0,%ymm1 vmovdqa %ymm0,%ymm2 @@ -6263,8 +6263,8 @@ open_avx2_320: vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 jmp open_avx2_short -############################################################################### -############################################################################### + + .p2align 6 chacha20_poly1305_seal_avx2: @@ -7334,7 +7334,7 @@ chacha20_poly1305_seal_avx2: xorq %r8,%r8 cmpq $128,%rbx ja 3f -############################################################################### + seal_avx2_tail_128: vmovdqa .chacha20_consts(%rip),%ymm0 vmovdqa 64(%rbp),%ymm4 @@ -7528,7 +7528,7 @@ seal_avx2_tail_128: 3: cmpq $256,%rbx ja 3f -############################################################################### + seal_avx2_tail_256: vmovdqa .chacha20_consts(%rip),%ymm0 vmovdqa 64(%rbp),%ymm4 @@ -7784,7 +7784,7 @@ seal_avx2_tail_256: 3: cmpq $384,%rbx ja seal_avx2_tail_512 -############################################################################### + 
seal_avx2_tail_384: vmovdqa .chacha20_consts(%rip),%ymm0 vmovdqa 64(%rbp),%ymm4 @@ -8096,7 +8096,7 @@ seal_avx2_tail_384: leaq 256(%rsi),%rsi subq $256,%rbx jmp seal_avx2_hash -############################################################################### + seal_avx2_tail_512: vmovdqa .chacha20_consts(%rip),%ymm0 vmovdqa 64(%rbp),%ymm4 @@ -8498,7 +8498,7 @@ seal_avx2_tail_512: leaq 384(%rsi),%rsi subq $384,%rbx jmp seal_avx2_hash -################################################################################ + seal_avx2_320: vmovdqa %ymm0,%ymm1 vmovdqa %ymm0,%ymm2 @@ -8660,7 +8660,7 @@ seal_avx2_320: vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 jmp seal_avx2_short -################################################################################ + seal_avx2_192: vmovdqa %ymm0,%ymm1 vmovdqa %ymm0,%ymm2 diff --git a/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aes-x86_64.S b/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aes-x86_64.S index b3453160016111..8875d0abbbd17c 100644 --- a/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aes-x86_64.S +++ b/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aes-x86_64.S @@ -91,8 +91,8 @@ L$enc_loop: movzbl %bh,%edi movzbl %ch,%ebp movzbl 2(%r14,%rsi,8),%r8d - movl 0(%r14,%rdi,8),%edi#%r10d - movl 0(%r14,%rbp,8),%ebp#%r11d + movl 0(%r14,%rdi,8),%edi + movl 0(%r14,%rbp,8),%ebp andl $0x0000ff00,%edi andl $0x0000ff00,%ebp @@ -104,8 +104,8 @@ L$enc_loop: movzbl %dh,%esi movzbl %ah,%edi shrl $16,%edx - movl 0(%r14,%rsi,8),%esi#%r12d - movl 0(%r14,%rdi,8),%edi#%r8d + movl 0(%r14,%rsi,8),%esi + movl 0(%r14,%rdi,8),%edi andl $0x0000ff00,%esi andl $0x0000ff00,%edi @@ -117,9 +117,9 @@ L$enc_loop: movzbl %cl,%esi movzbl %dl,%edi movzbl %al,%ebp - movl 0(%r14,%rsi,8),%esi#%r10d - movl 0(%r14,%rdi,8),%edi#%r11d - movl 0(%r14,%rbp,8),%ebp#%r12d + movl 0(%r14,%rsi,8),%esi + movl 0(%r14,%rdi,8),%edi + movl 0(%r14,%rbp,8),%ebp andl $0x00ff0000,%esi andl $0x00ff0000,%edi @@ -132,9 +132,9 @@ L$enc_loop: movzbl %bl,%esi movzbl %dh,%edi movzbl %ah,%ebp - movl 0(%r14,%rsi,8),%esi#%r8d - movl 2(%r14,%rdi,8),%edi#%r10d - movl 2(%r14,%rbp,8),%ebp#%r11d + movl 0(%r14,%rsi,8),%esi + movl 2(%r14,%rdi,8),%edi + movl 2(%r14,%rbp,8),%ebp andl $0x00ff0000,%esi andl $0xff000000,%edi @@ -147,8 +147,8 @@ L$enc_loop: movzbl %bh,%esi movzbl %ch,%edi movl 16+12(%r15),%edx - movl 2(%r14,%rsi,8),%esi#%r12d - movl 2(%r14,%rdi,8),%edi#%r8d + movl 2(%r14,%rsi,8),%esi + movl 2(%r14,%rdi,8),%edi movl 16+0(%r15),%eax andl $0xff000000,%esi @@ -199,12 +199,12 @@ L$enc_loop_compact: movzbl (%r14,%r12,1),%r12d movzbl (%r14,%r8,1),%r8d - movzbl (%r14,%rsi,1),%r9d#%r10d + movzbl (%r14,%rsi,1),%r9d movzbl %ah,%esi - movzbl (%r14,%rdi,1),%r13d#%r11d + movzbl (%r14,%rdi,1),%r13d movzbl %cl,%edi - movzbl (%r14,%rbp,1),%ebp#%r12d - movzbl (%r14,%rsi,1),%esi#%r8d + movzbl (%r14,%rbp,1),%ebp + movzbl (%r14,%rsi,1),%esi shll $8,%r9d shrl $16,%edx @@ -216,16 +216,16 @@ L$enc_loop_compact: xorl %r13d,%r11d shll $8,%ebp movzbl %al,%r13d - movzbl (%r14,%rdi,1),%edi#%r10d + movzbl (%r14,%rdi,1),%edi xorl %ebp,%r12d shll $8,%esi movzbl %bl,%ebp shll $16,%edi xorl %esi,%r8d - movzbl (%r14,%r9,1),%r9d#%r11d + movzbl (%r14,%r9,1),%r9d movzbl %dh,%esi - movzbl (%r14,%r13,1),%r13d#%r12d + movzbl (%r14,%r13,1),%r13d xorl %edi,%r10d shrl $8,%ecx @@ -234,11 +234,11 @@ L$enc_loop_compact: shrl $8,%ebx shll $16,%r13d xorl %r9d,%r11d - movzbl (%r14,%rbp,1),%ebp#%r8d - movzbl (%r14,%rsi,1),%esi#%r10d - movzbl (%r14,%rdi,1),%edi#%r11d - movzbl (%r14,%rcx,1),%edx#%r8d - movzbl 
(%r14,%rbx,1),%ecx#%r12d + movzbl (%r14,%rbp,1),%ebp + movzbl (%r14,%rsi,1),%esi + movzbl (%r14,%rdi,1),%edi + movzbl (%r14,%rcx,1),%edx + movzbl (%r14,%rbx,1),%ecx shll $16,%ebp xorl %r13d,%r12d @@ -504,8 +504,8 @@ L$dec_loop: movzbl %dh,%edi movzbl %ah,%ebp movzbl (%r14,%rsi,1),%r8d - movzbl (%r14,%rdi,1),%edi#%r10d - movzbl (%r14,%rbp,1),%ebp#%r11d + movzbl (%r14,%rdi,1),%edi + movzbl (%r14,%rbp,1),%ebp shll $8,%edi shll $8,%ebp @@ -517,8 +517,8 @@ L$dec_loop: movzbl %bh,%esi movzbl %ch,%edi shrl $16,%eax - movzbl (%r14,%rsi,1),%esi#%r12d - movzbl (%r14,%rdi,1),%edi#%r8d + movzbl (%r14,%rsi,1),%esi + movzbl (%r14,%rdi,1),%edi shll $8,%esi shll $8,%edi @@ -530,9 +530,9 @@ L$dec_loop: movzbl %cl,%esi movzbl %dl,%edi movzbl %al,%ebp - movzbl (%r14,%rsi,1),%esi#%r10d - movzbl (%r14,%rdi,1),%edi#%r11d - movzbl (%r14,%rbp,1),%ebp#%r12d + movzbl (%r14,%rsi,1),%esi + movzbl (%r14,%rdi,1),%edi + movzbl (%r14,%rbp,1),%ebp shll $16,%esi shll $16,%edi @@ -545,9 +545,9 @@ L$dec_loop: movzbl %bl,%esi movzbl %bh,%edi movzbl %ch,%ebp - movzbl (%r14,%rsi,1),%esi#%r8d - movzbl (%r14,%rdi,1),%edi#%r10d - movzbl (%r14,%rbp,1),%ebp#%r11d + movzbl (%r14,%rsi,1),%esi + movzbl (%r14,%rdi,1),%edi + movzbl (%r14,%rbp,1),%ebp shll $16,%esi shll $24,%edi @@ -560,8 +560,8 @@ L$dec_loop: movzbl %dh,%esi movzbl %ah,%edi movl 16+12(%r15),%edx - movzbl (%r14,%rsi,1),%esi#%r12d - movzbl (%r14,%rdi,1),%edi#%r8d + movzbl (%r14,%rsi,1),%esi + movzbl (%r14,%rdi,1),%edi movl 16+0(%r15),%eax shll $24,%esi @@ -614,11 +614,11 @@ L$dec_loop_compact: movzbl (%r14,%r12,1),%r12d movzbl (%r14,%r8,1),%r8d - movzbl (%r14,%rsi,1),%r9d#%r10d + movzbl (%r14,%rsi,1),%r9d movzbl %ch,%esi - movzbl (%r14,%rdi,1),%r13d#%r11d - movzbl (%r14,%rbp,1),%ebp#%r12d - movzbl (%r14,%rsi,1),%esi#%r8d + movzbl (%r14,%rdi,1),%r13d + movzbl (%r14,%rbp,1),%ebp + movzbl (%r14,%rsi,1),%esi shrl $16,%ecx shll $8,%r13d @@ -633,17 +633,17 @@ L$dec_loop_compact: xorl %r13d,%r11d shll $8,%esi movzbl %al,%r13d - movzbl (%r14,%rdi,1),%edi#%r10d + movzbl (%r14,%rdi,1),%edi xorl %ebp,%r12d movzbl %bl,%ebp shll $16,%edi xorl %esi,%r8d - movzbl (%r14,%r9,1),%r9d#%r11d + movzbl (%r14,%r9,1),%r9d movzbl %bh,%esi - movzbl (%r14,%rbp,1),%ebp#%r8d + movzbl (%r14,%rbp,1),%ebp xorl %edi,%r10d - movzbl (%r14,%r13,1),%r13d#%r12d + movzbl (%r14,%r13,1),%r13d movzbl %ch,%edi shll $16,%ebp @@ -655,10 +655,10 @@ L$dec_loop_compact: shrl $8,%eax xorl %r13d,%r12d - movzbl (%r14,%rsi,1),%esi#%r10d - movzbl (%r14,%rdi,1),%ebx#%r11d - movzbl (%r14,%rbp,1),%ecx#%r12d - movzbl (%r14,%rax,1),%edx#%r8d + movzbl (%r14,%rsi,1),%esi + movzbl (%r14,%rdi,1),%ebx + movzbl (%r14,%rbp,1),%ecx + movzbl (%r14,%rax,1),%edx movl %r10d,%eax shll $24,%esi @@ -1445,7 +1445,7 @@ L$cbc_te_ok: xchgq %rsp,%r15 -#add $8,%rsp + movq %r15,16(%rsp) L$cbc_fast_body: @@ -1496,7 +1496,7 @@ L$cbc_prefetch_te: cmpq $0,%rbx je L$FAST_DECRYPT -#----------------------------- ENCRYPT -----------------------------# + movl 0(%rbp),%eax movl 4(%rbp),%ebx movl 8(%rbp),%ecx @@ -1534,7 +1534,7 @@ L$cbc_fast_enc_loop: jmp L$cbc_fast_cleanup -#----------------------------- DECRYPT -----------------------------# + .p2align 4 L$FAST_DECRYPT: cmpq %r8,%r9 @@ -1642,7 +1642,7 @@ L$cbc_fast_cleanup: jmp L$cbc_exit -#--------------------------- SLOW ROUTINE ---------------------------# + .p2align 4 L$cbc_slow_prologue: @@ -1658,14 +1658,14 @@ L$cbc_slow_prologue: xchgq %rsp,%rbp -#add $8,%rsp + movq %rbp,16(%rsp) L$cbc_slow_body: -#mov %rdi,24(%rsp) -#mov %rsi,32(%rsp) -#mov %rdx,40(%rsp) -#mov %rcx,48(%rsp) + + + + movq %r8,56(%rsp) 
movq %r8,%rbp movq %r9,%rbx @@ -1690,7 +1690,7 @@ L$cbc_slow_body: cmpq $0,%rbx je L$SLOW_DECRYPT -#--------------------------- SLOW ENCRYPT ---------------------------# + testq $-16,%r10 movl 0(%rbp),%eax movl 4(%rbp),%ebx @@ -1751,7 +1751,7 @@ L$cbc_slow_enc_tail: movq %r11,%rax movq %r12,%rcx jmp L$cbc_slow_enc_loop -#--------------------------- SLOW DECRYPT ---------------------------# + .p2align 4 L$SLOW_DECRYPT: shrq $3,%rax diff --git a/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S b/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S index 0944fe4f4dfc38..9fb4bef1e68d80 100644 --- a/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S +++ b/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S @@ -57,7 +57,7 @@ L$resume_ctr32: -# + diff --git a/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-x86_64.S b/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-x86_64.S index 3f0870df3dedf8..5e12596731af86 100644 --- a/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-x86_64.S +++ b/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-x86_64.S @@ -549,7 +549,7 @@ _aes_hw_ecb_encrypt: movl %eax,%r10d testl %r8d,%r8d jz L$ecb_decrypt -#--------------------------- ECB ENCRYPT ------------------------------# + cmpq $0x80,%rdx jb L$ecb_enc_tail @@ -690,7 +690,7 @@ L$ecb_enc_six: movups %xmm6,64(%rsi) movups %xmm7,80(%rsi) jmp L$ecb_ret -#--------------------------- ECB DECRYPT ------------------------------# + .p2align 4 L$ecb_decrypt: cmpq $0x80,%rdx @@ -879,168 +879,6 @@ L$ecb_ret: .byte 0xf3,0xc3 -.globl _aes_hw_ccm64_encrypt_blocks -.private_extern _aes_hw_ccm64_encrypt_blocks - -.p2align 4 -_aes_hw_ccm64_encrypt_blocks: - movl 240(%rcx),%eax - movdqu (%r8),%xmm6 - movdqa L$increment64(%rip),%xmm9 - movdqa L$bswap_mask(%rip),%xmm7 - - shll $4,%eax - movl $16,%r10d - leaq 0(%rcx),%r11 - movdqu (%r9),%xmm3 - movdqa %xmm6,%xmm2 - leaq 32(%rcx,%rax,1),%rcx -.byte 102,15,56,0,247 - subq %rax,%r10 - jmp L$ccm64_enc_outer -.p2align 4 -L$ccm64_enc_outer: - movups (%r11),%xmm0 - movq %r10,%rax - movups (%rdi),%xmm8 - - xorps %xmm0,%xmm2 - movups 16(%r11),%xmm1 - xorps %xmm8,%xmm0 - xorps %xmm0,%xmm3 - movups 32(%r11),%xmm0 - -L$ccm64_enc2_loop: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$ccm64_enc2_loop -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - paddq %xmm9,%xmm6 - decq %rdx -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 - - leaq 16(%rdi),%rdi - xorps %xmm2,%xmm8 - movdqa %xmm6,%xmm2 - movups %xmm8,(%rsi) -.byte 102,15,56,0,215 - leaq 16(%rsi),%rsi - jnz L$ccm64_enc_outer - - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - movups %xmm3,(%r9) - pxor %xmm3,%xmm3 - pxor %xmm8,%xmm8 - pxor %xmm6,%xmm6 - .byte 0xf3,0xc3 - -.globl _aes_hw_ccm64_decrypt_blocks -.private_extern _aes_hw_ccm64_decrypt_blocks - -.p2align 4 -_aes_hw_ccm64_decrypt_blocks: - movl 240(%rcx),%eax - movups (%r8),%xmm6 - movdqu (%r9),%xmm3 - movdqa L$increment64(%rip),%xmm9 - movdqa L$bswap_mask(%rip),%xmm7 - - movaps %xmm6,%xmm2 - movl %eax,%r10d - movq %rcx,%r11 -.byte 102,15,56,0,247 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -L$oop_enc1_5: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz L$oop_enc1_5 -.byte 102,15,56,221,209 - shll $4,%r10d - movl $16,%eax - movups (%rdi),%xmm8 - paddq 
%xmm9,%xmm6 - leaq 16(%rdi),%rdi - subq %r10,%rax - leaq 32(%r11,%r10,1),%rcx - movq %rax,%r10 - jmp L$ccm64_dec_outer -.p2align 4 -L$ccm64_dec_outer: - xorps %xmm2,%xmm8 - movdqa %xmm6,%xmm2 - movups %xmm8,(%rsi) - leaq 16(%rsi),%rsi -.byte 102,15,56,0,215 - - subq $1,%rdx - jz L$ccm64_dec_break - - movups (%r11),%xmm0 - movq %r10,%rax - movups 16(%r11),%xmm1 - xorps %xmm0,%xmm8 - xorps %xmm0,%xmm2 - xorps %xmm8,%xmm3 - movups 32(%r11),%xmm0 - jmp L$ccm64_dec2_loop -.p2align 4 -L$ccm64_dec2_loop: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$ccm64_dec2_loop - movups (%rdi),%xmm8 - paddq %xmm9,%xmm6 -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 - leaq 16(%rdi),%rdi - jmp L$ccm64_dec_outer - -.p2align 4 -L$ccm64_dec_break: -#xorps %xmm8,%xmm3 - movl 240(%r11),%eax - movups (%r11),%xmm0 - movups 16(%r11),%xmm1 - xorps %xmm0,%xmm8 - leaq 32(%r11),%r11 - xorps %xmm8,%xmm3 -L$oop_enc1_6: -.byte 102,15,56,220,217 - decl %eax - movups (%r11),%xmm1 - leaq 16(%r11),%r11 - jnz L$oop_enc1_6 -.byte 102,15,56,221,217 - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - movups %xmm3,(%r9) - pxor %xmm3,%xmm3 - pxor %xmm8,%xmm8 - pxor %xmm6,%xmm6 - .byte 0xf3,0xc3 - .globl _aes_hw_ctr32_encrypt_blocks .private_extern _aes_hw_ctr32_encrypt_blocks @@ -1064,12 +902,12 @@ _aes_hw_ctr32_encrypt_blocks: movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 -L$oop_enc1_7: +L$oop_enc1_5: .byte 102,15,56,220,209 decl %edx movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz L$oop_enc1_7 + jnz L$oop_enc1_5 .byte 102,15,56,221,209 pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 @@ -1454,7 +1292,7 @@ L$ctr32_enc_done: movdqa 64(%rsp),%xmm15 .byte 102,68,15,56,221,193 movdqa 80(%rsp),%xmm0 - movups 16-128(%rcx),%xmm1#real 1st-round key + movups 16-128(%rcx),%xmm1 .byte 102,69,15,56,221,202 movups %xmm2,(%rsi) @@ -1626,1896 +1464,73 @@ L$ctr32_epilogue: .byte 0xf3,0xc3 -.globl _aes_hw_xts_encrypt -.private_extern _aes_hw_xts_encrypt +.globl _aes_hw_cbc_encrypt +.private_extern _aes_hw_cbc_encrypt .p2align 4 -_aes_hw_xts_encrypt: - - leaq (%rsp),%r11 +_aes_hw_cbc_encrypt: - pushq %rbp + testq %rdx,%rdx + jz L$cbc_ret - subq $112,%rsp - andq $-16,%rsp - movups (%r9),%xmm2 - movl 240(%r8),%eax movl 240(%rcx),%r10d - movups (%r8),%xmm0 - movups 16(%r8),%xmm1 - leaq 32(%r8),%r8 - xorps %xmm0,%xmm2 -L$oop_enc1_8: + movq %rcx,%r11 + testl %r9d,%r9d + jz L$cbc_decrypt + + movups (%r8),%xmm2 + movl %r10d,%eax + cmpq $16,%rdx + jb L$cbc_enc_tail + subq $16,%rdx + jmp L$cbc_enc_loop +.p2align 4 +L$cbc_enc_loop: + movups (%rdi),%xmm3 + leaq 16(%rdi),%rdi + + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm3 + leaq 32(%rcx),%rcx + xorps %xmm3,%xmm2 +L$oop_enc1_6: .byte 102,15,56,220,209 decl %eax - movups (%r8),%xmm1 - leaq 16(%r8),%r8 - jnz L$oop_enc1_8 + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz L$oop_enc1_6 .byte 102,15,56,221,209 - movups (%rcx),%xmm0 - movq %rcx,%rbp movl %r10d,%eax - shll $4,%r10d - movq %rdx,%r9 - andq $-16,%rdx - - movups 16(%rcx,%r10,1),%xmm1 - - movdqa L$xts_magic(%rip),%xmm8 - movdqa %xmm2,%xmm15 - pshufd $0x5f,%xmm2,%xmm9 - pxor %xmm0,%xmm1 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm10 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm10 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm11 - psrad $31,%xmm14 - paddq 
%xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm11 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm12 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm12 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm13 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm13 - pxor %xmm14,%xmm15 - movdqa %xmm15,%xmm14 - psrad $31,%xmm9 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pxor %xmm0,%xmm14 - pxor %xmm9,%xmm15 - movaps %xmm1,96(%rsp) - - subq $96,%rdx - jc L$xts_enc_short - - movl $16+96,%eax - leaq 32(%rbp,%r10,1),%rcx - subq %r10,%rax - movups 16(%rbp),%xmm1 - movq %rax,%r10 - leaq L$xts_magic(%rip),%r8 - jmp L$xts_enc_grandloop - -.p2align 5 -L$xts_enc_grandloop: - movdqu 0(%rdi),%xmm2 - movdqa %xmm0,%xmm8 - movdqu 16(%rdi),%xmm3 - pxor %xmm10,%xmm2 - movdqu 32(%rdi),%xmm4 - pxor %xmm11,%xmm3 -.byte 102,15,56,220,209 - movdqu 48(%rdi),%xmm5 - pxor %xmm12,%xmm4 -.byte 102,15,56,220,217 - movdqu 64(%rdi),%xmm6 - pxor %xmm13,%xmm5 -.byte 102,15,56,220,225 - movdqu 80(%rdi),%xmm7 - pxor %xmm15,%xmm8 - movdqa 96(%rsp),%xmm9 - pxor %xmm14,%xmm6 -.byte 102,15,56,220,233 - movups 32(%rbp),%xmm0 - leaq 96(%rdi),%rdi - pxor %xmm8,%xmm7 - - pxor %xmm9,%xmm10 -.byte 102,15,56,220,241 - pxor %xmm9,%xmm11 - movdqa %xmm10,0(%rsp) -.byte 102,15,56,220,249 - movups 48(%rbp),%xmm1 - pxor %xmm9,%xmm12 - -.byte 102,15,56,220,208 - pxor %xmm9,%xmm13 - movdqa %xmm11,16(%rsp) -.byte 102,15,56,220,216 - pxor %xmm9,%xmm14 - movdqa %xmm12,32(%rsp) -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - pxor %xmm9,%xmm8 - movdqa %xmm14,64(%rsp) -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups 64(%rbp),%xmm0 - movdqa %xmm8,80(%rsp) - pshufd $0x5f,%xmm15,%xmm9 - jmp L$xts_enc_loop6 -.p2align 5 -L$xts_enc_loop6: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups -64(%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups -80(%rcx,%rax,1),%xmm0 - jnz L$xts_enc_loop6 - - movdqa (%r8),%xmm8 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 -.byte 102,15,56,220,209 - paddq %xmm15,%xmm15 - psrad $31,%xmm14 -.byte 102,15,56,220,217 - pand %xmm8,%xmm14 - movups (%rbp),%xmm10 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 - pxor %xmm14,%xmm15 - movaps %xmm10,%xmm11 -.byte 102,15,56,220,249 - movups -64(%rcx),%xmm1 - - movdqa %xmm9,%xmm14 -.byte 102,15,56,220,208 - paddd %xmm9,%xmm9 - pxor %xmm15,%xmm10 -.byte 102,15,56,220,216 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - pand %xmm8,%xmm14 - movaps %xmm11,%xmm12 -.byte 102,15,56,220,240 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 -.byte 102,15,56,220,248 - movups -48(%rcx),%xmm0 - - paddd %xmm9,%xmm9 -.byte 102,15,56,220,209 - pxor %xmm15,%xmm11 - psrad $31,%xmm14 -.byte 102,15,56,220,217 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movdqa %xmm13,48(%rsp) - pxor %xmm14,%xmm15 -.byte 102,15,56,220,241 - movaps %xmm12,%xmm13 - movdqa %xmm9,%xmm14 -.byte 102,15,56,220,249 - movups -32(%rcx),%xmm1 - - paddd %xmm9,%xmm9 -.byte 102,15,56,220,208 - pxor %xmm15,%xmm12 - psrad $31,%xmm14 -.byte 102,15,56,220,216 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 
-.byte 102,15,56,220,240 - pxor %xmm14,%xmm15 - movaps %xmm13,%xmm14 -.byte 102,15,56,220,248 - - movdqa %xmm9,%xmm0 - paddd %xmm9,%xmm9 -.byte 102,15,56,220,209 - pxor %xmm15,%xmm13 - psrad $31,%xmm0 -.byte 102,15,56,220,217 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm0 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - pxor %xmm0,%xmm15 - movups (%rbp),%xmm0 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups 16(%rbp),%xmm1 - - pxor %xmm15,%xmm14 -.byte 102,15,56,221,84,36,0 - psrad $31,%xmm9 - paddq %xmm15,%xmm15 -.byte 102,15,56,221,92,36,16 -.byte 102,15,56,221,100,36,32 - pand %xmm8,%xmm9 - movq %r10,%rax -.byte 102,15,56,221,108,36,48 -.byte 102,15,56,221,116,36,64 -.byte 102,15,56,221,124,36,80 - pxor %xmm9,%xmm15 + movq %r11,%rcx + movups %xmm2,0(%rsi) + leaq 16(%rsi),%rsi + subq $16,%rdx + jnc L$cbc_enc_loop + addq $16,%rdx + jnz L$cbc_enc_tail + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movups %xmm2,(%r8) + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + jmp L$cbc_ret - leaq 96(%rsi),%rsi - movups %xmm2,-96(%rsi) - movups %xmm3,-80(%rsi) - movups %xmm4,-64(%rsi) - movups %xmm5,-48(%rsi) - movups %xmm6,-32(%rsi) - movups %xmm7,-16(%rsi) - subq $96,%rdx - jnc L$xts_enc_grandloop - - movl $16+96,%eax - subl %r10d,%eax - movq %rbp,%rcx - shrl $4,%eax +L$cbc_enc_tail: + movq %rdx,%rcx + xchgq %rdi,%rsi +.long 0x9066A4F3 + movl $16,%ecx + subq %rdx,%rcx + xorl %eax,%eax +.long 0x9066AAF3 + leaq -16(%rdi),%rdi + movl %r10d,%eax + movq %rdi,%rsi + movq %r11,%rcx + xorq %rdx,%rdx + jmp L$cbc_enc_loop -L$xts_enc_short: - - movl %eax,%r10d - pxor %xmm0,%xmm10 - addq $96,%rdx - jz L$xts_enc_done - - pxor %xmm0,%xmm11 - cmpq $0x20,%rdx - jb L$xts_enc_one - pxor %xmm0,%xmm12 - je L$xts_enc_two - - pxor %xmm0,%xmm13 - cmpq $0x40,%rdx - jb L$xts_enc_three - pxor %xmm0,%xmm14 - je L$xts_enc_four - - movdqu (%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqu 32(%rdi),%xmm4 - pxor %xmm10,%xmm2 - movdqu 48(%rdi),%xmm5 - pxor %xmm11,%xmm3 - movdqu 64(%rdi),%xmm6 - leaq 80(%rdi),%rdi - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm5 - pxor %xmm14,%xmm6 - pxor %xmm7,%xmm7 - - call _aesni_encrypt6 - - xorps %xmm10,%xmm2 - movdqa %xmm15,%xmm10 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - movdqu %xmm2,(%rsi) - xorps %xmm13,%xmm5 - movdqu %xmm3,16(%rsi) - xorps %xmm14,%xmm6 - movdqu %xmm4,32(%rsi) - movdqu %xmm5,48(%rsi) - movdqu %xmm6,64(%rsi) - leaq 80(%rsi),%rsi - jmp L$xts_enc_done - -.p2align 4 -L$xts_enc_one: - movups (%rdi),%xmm2 - leaq 16(%rdi),%rdi - xorps %xmm10,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -L$oop_enc1_9: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz L$oop_enc1_9 -.byte 102,15,56,221,209 - xorps %xmm10,%xmm2 - movdqa %xmm11,%xmm10 - movups %xmm2,(%rsi) - leaq 16(%rsi),%rsi - jmp L$xts_enc_done - -.p2align 4 -L$xts_enc_two: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - leaq 32(%rdi),%rdi - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - - call _aesni_encrypt2 - - xorps %xmm10,%xmm2 - movdqa %xmm12,%xmm10 - xorps %xmm11,%xmm3 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - leaq 32(%rsi),%rsi - jmp L$xts_enc_done - -.p2align 4 -L$xts_enc_three: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 32(%rdi),%xmm4 - leaq 48(%rdi),%rdi - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - - call _aesni_encrypt3 - - xorps %xmm10,%xmm2 - movdqa %xmm13,%xmm10 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - leaq 48(%rsi),%rsi - jmp L$xts_enc_done - -.p2align 
4 -L$xts_enc_four: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 32(%rdi),%xmm4 - xorps %xmm10,%xmm2 - movups 48(%rdi),%xmm5 - leaq 64(%rdi),%rdi - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - xorps %xmm13,%xmm5 - - call _aesni_encrypt4 - - pxor %xmm10,%xmm2 - movdqa %xmm14,%xmm10 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm4 - movdqu %xmm2,(%rsi) - pxor %xmm13,%xmm5 - movdqu %xmm3,16(%rsi) - movdqu %xmm4,32(%rsi) - movdqu %xmm5,48(%rsi) - leaq 64(%rsi),%rsi - jmp L$xts_enc_done - -.p2align 4 -L$xts_enc_done: - andq $15,%r9 - jz L$xts_enc_ret - movq %r9,%rdx - -L$xts_enc_steal: - movzbl (%rdi),%eax - movzbl -16(%rsi),%ecx - leaq 1(%rdi),%rdi - movb %al,-16(%rsi) - movb %cl,0(%rsi) - leaq 1(%rsi),%rsi - subq $1,%rdx - jnz L$xts_enc_steal - - subq %r9,%rsi - movq %rbp,%rcx - movl %r10d,%eax - - movups -16(%rsi),%xmm2 - xorps %xmm10,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -L$oop_enc1_10: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz L$oop_enc1_10 -.byte 102,15,56,221,209 - xorps %xmm10,%xmm2 - movups %xmm2,-16(%rsi) - -L$xts_enc_ret: - xorps %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - movaps %xmm0,0(%rsp) - pxor %xmm8,%xmm8 - movaps %xmm0,16(%rsp) - pxor %xmm9,%xmm9 - movaps %xmm0,32(%rsp) - pxor %xmm10,%xmm10 - movaps %xmm0,48(%rsp) - pxor %xmm11,%xmm11 - movaps %xmm0,64(%rsp) - pxor %xmm12,%xmm12 - movaps %xmm0,80(%rsp) - pxor %xmm13,%xmm13 - movaps %xmm0,96(%rsp) - pxor %xmm14,%xmm14 - pxor %xmm15,%xmm15 - movq -8(%r11),%rbp - - leaq (%r11),%rsp - -L$xts_enc_epilogue: - .byte 0xf3,0xc3 - - -.globl _aes_hw_xts_decrypt -.private_extern _aes_hw_xts_decrypt - -.p2align 4 -_aes_hw_xts_decrypt: - - leaq (%rsp),%r11 - - pushq %rbp - - subq $112,%rsp - andq $-16,%rsp - movups (%r9),%xmm2 - movl 240(%r8),%eax - movl 240(%rcx),%r10d - movups (%r8),%xmm0 - movups 16(%r8),%xmm1 - leaq 32(%r8),%r8 - xorps %xmm0,%xmm2 -L$oop_enc1_11: -.byte 102,15,56,220,209 - decl %eax - movups (%r8),%xmm1 - leaq 16(%r8),%r8 - jnz L$oop_enc1_11 -.byte 102,15,56,221,209 - xorl %eax,%eax - testq $15,%rdx - setnz %al - shlq $4,%rax - subq %rax,%rdx - - movups (%rcx),%xmm0 - movq %rcx,%rbp - movl %r10d,%eax - shll $4,%r10d - movq %rdx,%r9 - andq $-16,%rdx - - movups 16(%rcx,%r10,1),%xmm1 - - movdqa L$xts_magic(%rip),%xmm8 - movdqa %xmm2,%xmm15 - pshufd $0x5f,%xmm2,%xmm9 - pxor %xmm0,%xmm1 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm10 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm10 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm11 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm11 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm12 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm12 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm13 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm13 - pxor %xmm14,%xmm15 - movdqa %xmm15,%xmm14 - psrad $31,%xmm9 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pxor %xmm0,%xmm14 - pxor %xmm9,%xmm15 - movaps %xmm1,96(%rsp) - - subq $96,%rdx - jc L$xts_dec_short - - movl $16+96,%eax - leaq 32(%rbp,%r10,1),%rcx - subq %r10,%rax - movups 16(%rbp),%xmm1 - movq %rax,%r10 - leaq L$xts_magic(%rip),%r8 - jmp L$xts_dec_grandloop - -.p2align 5 -L$xts_dec_grandloop: - movdqu 
0(%rdi),%xmm2 - movdqa %xmm0,%xmm8 - movdqu 16(%rdi),%xmm3 - pxor %xmm10,%xmm2 - movdqu 32(%rdi),%xmm4 - pxor %xmm11,%xmm3 -.byte 102,15,56,222,209 - movdqu 48(%rdi),%xmm5 - pxor %xmm12,%xmm4 -.byte 102,15,56,222,217 - movdqu 64(%rdi),%xmm6 - pxor %xmm13,%xmm5 -.byte 102,15,56,222,225 - movdqu 80(%rdi),%xmm7 - pxor %xmm15,%xmm8 - movdqa 96(%rsp),%xmm9 - pxor %xmm14,%xmm6 -.byte 102,15,56,222,233 - movups 32(%rbp),%xmm0 - leaq 96(%rdi),%rdi - pxor %xmm8,%xmm7 - - pxor %xmm9,%xmm10 -.byte 102,15,56,222,241 - pxor %xmm9,%xmm11 - movdqa %xmm10,0(%rsp) -.byte 102,15,56,222,249 - movups 48(%rbp),%xmm1 - pxor %xmm9,%xmm12 - -.byte 102,15,56,222,208 - pxor %xmm9,%xmm13 - movdqa %xmm11,16(%rsp) -.byte 102,15,56,222,216 - pxor %xmm9,%xmm14 - movdqa %xmm12,32(%rsp) -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 - pxor %xmm9,%xmm8 - movdqa %xmm14,64(%rsp) -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 - movups 64(%rbp),%xmm0 - movdqa %xmm8,80(%rsp) - pshufd $0x5f,%xmm15,%xmm9 - jmp L$xts_dec_loop6 -.p2align 5 -L$xts_dec_loop6: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups -64(%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 - movups -80(%rcx,%rax,1),%xmm0 - jnz L$xts_dec_loop6 - - movdqa (%r8),%xmm8 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 -.byte 102,15,56,222,209 - paddq %xmm15,%xmm15 - psrad $31,%xmm14 -.byte 102,15,56,222,217 - pand %xmm8,%xmm14 - movups (%rbp),%xmm10 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 - pxor %xmm14,%xmm15 - movaps %xmm10,%xmm11 -.byte 102,15,56,222,249 - movups -64(%rcx),%xmm1 - - movdqa %xmm9,%xmm14 -.byte 102,15,56,222,208 - paddd %xmm9,%xmm9 - pxor %xmm15,%xmm10 -.byte 102,15,56,222,216 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 - pand %xmm8,%xmm14 - movaps %xmm11,%xmm12 -.byte 102,15,56,222,240 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 -.byte 102,15,56,222,248 - movups -48(%rcx),%xmm0 - - paddd %xmm9,%xmm9 -.byte 102,15,56,222,209 - pxor %xmm15,%xmm11 - psrad $31,%xmm14 -.byte 102,15,56,222,217 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - movdqa %xmm13,48(%rsp) - pxor %xmm14,%xmm15 -.byte 102,15,56,222,241 - movaps %xmm12,%xmm13 - movdqa %xmm9,%xmm14 -.byte 102,15,56,222,249 - movups -32(%rcx),%xmm1 - - paddd %xmm9,%xmm9 -.byte 102,15,56,222,208 - pxor %xmm15,%xmm12 - psrad $31,%xmm14 -.byte 102,15,56,222,216 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 - pxor %xmm14,%xmm15 - movaps %xmm13,%xmm14 -.byte 102,15,56,222,248 - - movdqa %xmm9,%xmm0 - paddd %xmm9,%xmm9 -.byte 102,15,56,222,209 - pxor %xmm15,%xmm13 - psrad $31,%xmm0 -.byte 102,15,56,222,217 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm0 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - pxor %xmm0,%xmm15 - movups (%rbp),%xmm0 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups 16(%rbp),%xmm1 - - pxor %xmm15,%xmm14 -.byte 102,15,56,223,84,36,0 - psrad $31,%xmm9 - paddq %xmm15,%xmm15 -.byte 102,15,56,223,92,36,16 -.byte 102,15,56,223,100,36,32 - pand %xmm8,%xmm9 - movq %r10,%rax -.byte 102,15,56,223,108,36,48 -.byte 102,15,56,223,116,36,64 -.byte 102,15,56,223,124,36,80 - pxor %xmm9,%xmm15 - - leaq 96(%rsi),%rsi - movups %xmm2,-96(%rsi) - movups 
%xmm3,-80(%rsi) - movups %xmm4,-64(%rsi) - movups %xmm5,-48(%rsi) - movups %xmm6,-32(%rsi) - movups %xmm7,-16(%rsi) - subq $96,%rdx - jnc L$xts_dec_grandloop - - movl $16+96,%eax - subl %r10d,%eax - movq %rbp,%rcx - shrl $4,%eax - -L$xts_dec_short: - - movl %eax,%r10d - pxor %xmm0,%xmm10 - pxor %xmm0,%xmm11 - addq $96,%rdx - jz L$xts_dec_done - - pxor %xmm0,%xmm12 - cmpq $0x20,%rdx - jb L$xts_dec_one - pxor %xmm0,%xmm13 - je L$xts_dec_two - - pxor %xmm0,%xmm14 - cmpq $0x40,%rdx - jb L$xts_dec_three - je L$xts_dec_four - - movdqu (%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqu 32(%rdi),%xmm4 - pxor %xmm10,%xmm2 - movdqu 48(%rdi),%xmm5 - pxor %xmm11,%xmm3 - movdqu 64(%rdi),%xmm6 - leaq 80(%rdi),%rdi - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm5 - pxor %xmm14,%xmm6 - - call _aesni_decrypt6 - - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - movdqu %xmm2,(%rsi) - xorps %xmm13,%xmm5 - movdqu %xmm3,16(%rsi) - xorps %xmm14,%xmm6 - movdqu %xmm4,32(%rsi) - pxor %xmm14,%xmm14 - movdqu %xmm5,48(%rsi) - pcmpgtd %xmm15,%xmm14 - movdqu %xmm6,64(%rsi) - leaq 80(%rsi),%rsi - pshufd $0x13,%xmm14,%xmm11 - andq $15,%r9 - jz L$xts_dec_ret - - movdqa %xmm15,%xmm10 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm11 - pxor %xmm15,%xmm11 - jmp L$xts_dec_done2 - -.p2align 4 -L$xts_dec_one: - movups (%rdi),%xmm2 - leaq 16(%rdi),%rdi - xorps %xmm10,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -L$oop_dec1_12: -.byte 102,15,56,222,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz L$oop_dec1_12 -.byte 102,15,56,223,209 - xorps %xmm10,%xmm2 - movdqa %xmm11,%xmm10 - movups %xmm2,(%rsi) - movdqa %xmm12,%xmm11 - leaq 16(%rsi),%rsi - jmp L$xts_dec_done - -.p2align 4 -L$xts_dec_two: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - leaq 32(%rdi),%rdi - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - - call _aesni_decrypt2 - - xorps %xmm10,%xmm2 - movdqa %xmm12,%xmm10 - xorps %xmm11,%xmm3 - movdqa %xmm13,%xmm11 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - leaq 32(%rsi),%rsi - jmp L$xts_dec_done - -.p2align 4 -L$xts_dec_three: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 32(%rdi),%xmm4 - leaq 48(%rdi),%rdi - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - - call _aesni_decrypt3 - - xorps %xmm10,%xmm2 - movdqa %xmm13,%xmm10 - xorps %xmm11,%xmm3 - movdqa %xmm14,%xmm11 - xorps %xmm12,%xmm4 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - leaq 48(%rsi),%rsi - jmp L$xts_dec_done - -.p2align 4 -L$xts_dec_four: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 32(%rdi),%xmm4 - xorps %xmm10,%xmm2 - movups 48(%rdi),%xmm5 - leaq 64(%rdi),%rdi - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - xorps %xmm13,%xmm5 - - call _aesni_decrypt4 - - pxor %xmm10,%xmm2 - movdqa %xmm14,%xmm10 - pxor %xmm11,%xmm3 - movdqa %xmm15,%xmm11 - pxor %xmm12,%xmm4 - movdqu %xmm2,(%rsi) - pxor %xmm13,%xmm5 - movdqu %xmm3,16(%rsi) - movdqu %xmm4,32(%rsi) - movdqu %xmm5,48(%rsi) - leaq 64(%rsi),%rsi - jmp L$xts_dec_done - -.p2align 4 -L$xts_dec_done: - andq $15,%r9 - jz L$xts_dec_ret -L$xts_dec_done2: - movq %r9,%rdx - movq %rbp,%rcx - movl %r10d,%eax - - movups (%rdi),%xmm2 - xorps %xmm11,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -L$oop_dec1_13: -.byte 102,15,56,222,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz L$oop_dec1_13 -.byte 102,15,56,223,209 - xorps %xmm11,%xmm2 - movups %xmm2,(%rsi) - -L$xts_dec_steal: - movzbl 16(%rdi),%eax - movzbl (%rsi),%ecx - leaq 1(%rdi),%rdi - movb %al,(%rsi) 
- movb %cl,16(%rsi) - leaq 1(%rsi),%rsi - subq $1,%rdx - jnz L$xts_dec_steal - - subq %r9,%rsi - movq %rbp,%rcx - movl %r10d,%eax - - movups (%rsi),%xmm2 - xorps %xmm10,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -L$oop_dec1_14: -.byte 102,15,56,222,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz L$oop_dec1_14 -.byte 102,15,56,223,209 - xorps %xmm10,%xmm2 - movups %xmm2,(%rsi) - -L$xts_dec_ret: - xorps %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - movaps %xmm0,0(%rsp) - pxor %xmm8,%xmm8 - movaps %xmm0,16(%rsp) - pxor %xmm9,%xmm9 - movaps %xmm0,32(%rsp) - pxor %xmm10,%xmm10 - movaps %xmm0,48(%rsp) - pxor %xmm11,%xmm11 - movaps %xmm0,64(%rsp) - pxor %xmm12,%xmm12 - movaps %xmm0,80(%rsp) - pxor %xmm13,%xmm13 - movaps %xmm0,96(%rsp) - pxor %xmm14,%xmm14 - pxor %xmm15,%xmm15 - movq -8(%r11),%rbp - - leaq (%r11),%rsp - -L$xts_dec_epilogue: - .byte 0xf3,0xc3 - - -.globl _aes_hw_ocb_encrypt -.private_extern _aes_hw_ocb_encrypt - -.p2align 5 -_aes_hw_ocb_encrypt: - - leaq (%rsp),%rax - pushq %rbx - - pushq %rbp - - pushq %r12 - - pushq %r13 - - pushq %r14 - - movq 8(%rax),%rbx - movq 8+8(%rax),%rbp - - movl 240(%rcx),%r10d - movq %rcx,%r11 - shll $4,%r10d - movups (%rcx),%xmm9 - movups 16(%rcx,%r10,1),%xmm1 - - movdqu (%r9),%xmm15 - pxor %xmm1,%xmm9 - pxor %xmm1,%xmm15 - - movl $16+32,%eax - leaq 32(%r11,%r10,1),%rcx - movups 16(%r11),%xmm1 - subq %r10,%rax - movq %rax,%r10 - - movdqu (%rbx),%xmm10 - movdqu (%rbp),%xmm8 - - testq $1,%r8 - jnz L$ocb_enc_odd - - bsfq %r8,%r12 - addq $1,%r8 - shlq $4,%r12 - movdqu (%rbx,%r12,1),%xmm7 - movdqu (%rdi),%xmm2 - leaq 16(%rdi),%rdi - - call __ocb_encrypt1 - - movdqa %xmm7,%xmm15 - movups %xmm2,(%rsi) - leaq 16(%rsi),%rsi - subq $1,%rdx - jz L$ocb_enc_done - -L$ocb_enc_odd: - leaq 1(%r8),%r12 - leaq 3(%r8),%r13 - leaq 5(%r8),%r14 - leaq 6(%r8),%r8 - bsfq %r12,%r12 - bsfq %r13,%r13 - bsfq %r14,%r14 - shlq $4,%r12 - shlq $4,%r13 - shlq $4,%r14 - - subq $6,%rdx - jc L$ocb_enc_short - jmp L$ocb_enc_grandloop - -.p2align 5 -L$ocb_enc_grandloop: - movdqu 0(%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqu 32(%rdi),%xmm4 - movdqu 48(%rdi),%xmm5 - movdqu 64(%rdi),%xmm6 - movdqu 80(%rdi),%xmm7 - leaq 96(%rdi),%rdi - - call __ocb_encrypt6 - - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) - leaq 96(%rsi),%rsi - subq $6,%rdx - jnc L$ocb_enc_grandloop - -L$ocb_enc_short: - addq $6,%rdx - jz L$ocb_enc_done - - movdqu 0(%rdi),%xmm2 - cmpq $2,%rdx - jb L$ocb_enc_one - movdqu 16(%rdi),%xmm3 - je L$ocb_enc_two - - movdqu 32(%rdi),%xmm4 - cmpq $4,%rdx - jb L$ocb_enc_three - movdqu 48(%rdi),%xmm5 - je L$ocb_enc_four - - movdqu 64(%rdi),%xmm6 - pxor %xmm7,%xmm7 - - call __ocb_encrypt6 - - movdqa %xmm14,%xmm15 - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - - jmp L$ocb_enc_done - -.p2align 4 -L$ocb_enc_one: - movdqa %xmm10,%xmm7 - - call __ocb_encrypt1 - - movdqa %xmm7,%xmm15 - movups %xmm2,0(%rsi) - jmp L$ocb_enc_done - -.p2align 4 -L$ocb_enc_two: - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - - call __ocb_encrypt4 - - movdqa %xmm11,%xmm15 - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - - jmp L$ocb_enc_done - -.p2align 4 -L$ocb_enc_three: - pxor %xmm5,%xmm5 - - call __ocb_encrypt4 - - movdqa %xmm12,%xmm15 - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - movups 
%xmm4,32(%rsi) - - jmp L$ocb_enc_done - -.p2align 4 -L$ocb_enc_four: - call __ocb_encrypt4 - - movdqa %xmm13,%xmm15 - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - -L$ocb_enc_done: - pxor %xmm0,%xmm15 - movdqu %xmm8,(%rbp) - movdqu %xmm15,(%r9) - - xorps %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - pxor %xmm8,%xmm8 - pxor %xmm9,%xmm9 - pxor %xmm10,%xmm10 - pxor %xmm11,%xmm11 - pxor %xmm12,%xmm12 - pxor %xmm13,%xmm13 - pxor %xmm14,%xmm14 - pxor %xmm15,%xmm15 - leaq 40(%rsp),%rax - - movq -40(%rax),%r14 - - movq -32(%rax),%r13 - - movq -24(%rax),%r12 - - movq -16(%rax),%rbp - - movq -8(%rax),%rbx - - leaq (%rax),%rsp - -L$ocb_enc_epilogue: - .byte 0xf3,0xc3 - - - - -.p2align 5 -__ocb_encrypt6: - pxor %xmm9,%xmm15 - movdqu (%rbx,%r12,1),%xmm11 - movdqa %xmm10,%xmm12 - movdqu (%rbx,%r13,1),%xmm13 - movdqa %xmm10,%xmm14 - pxor %xmm15,%xmm10 - movdqu (%rbx,%r14,1),%xmm15 - pxor %xmm10,%xmm11 - pxor %xmm2,%xmm8 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm12 - pxor %xmm3,%xmm8 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm13 - pxor %xmm4,%xmm8 - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm14 - pxor %xmm5,%xmm8 - pxor %xmm13,%xmm5 - pxor %xmm14,%xmm15 - pxor %xmm6,%xmm8 - pxor %xmm14,%xmm6 - pxor %xmm7,%xmm8 - pxor %xmm15,%xmm7 - movups 32(%r11),%xmm0 - - leaq 1(%r8),%r12 - leaq 3(%r8),%r13 - leaq 5(%r8),%r14 - addq $6,%r8 - pxor %xmm9,%xmm10 - bsfq %r12,%r12 - bsfq %r13,%r13 - bsfq %r14,%r14 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - pxor %xmm9,%xmm11 - pxor %xmm9,%xmm12 -.byte 102,15,56,220,241 - pxor %xmm9,%xmm13 - pxor %xmm9,%xmm14 -.byte 102,15,56,220,249 - movups 48(%r11),%xmm1 - pxor %xmm9,%xmm15 - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups 64(%r11),%xmm0 - shlq $4,%r12 - shlq $4,%r13 - jmp L$ocb_enc_loop6 - -.p2align 5 -L$ocb_enc_loop6: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$ocb_enc_loop6 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups 16(%r11),%xmm1 - shlq $4,%r14 - -.byte 102,65,15,56,221,210 - movdqu (%rbx),%xmm10 - movq %r10,%rax -.byte 102,65,15,56,221,219 -.byte 102,65,15,56,221,228 -.byte 102,65,15,56,221,237 -.byte 102,65,15,56,221,246 -.byte 102,65,15,56,221,255 - .byte 0xf3,0xc3 - - - -.p2align 5 -__ocb_encrypt4: - pxor %xmm9,%xmm15 - movdqu (%rbx,%r12,1),%xmm11 - movdqa %xmm10,%xmm12 - movdqu (%rbx,%r13,1),%xmm13 - pxor %xmm15,%xmm10 - pxor %xmm10,%xmm11 - pxor %xmm2,%xmm8 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm12 - pxor %xmm3,%xmm8 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm13 - pxor %xmm4,%xmm8 - pxor %xmm12,%xmm4 - pxor %xmm5,%xmm8 - pxor %xmm13,%xmm5 - movups 32(%r11),%xmm0 - - pxor %xmm9,%xmm10 - pxor %xmm9,%xmm11 - pxor %xmm9,%xmm12 - pxor %xmm9,%xmm13 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movups 48(%r11),%xmm1 - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 
-.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - movups 64(%r11),%xmm0 - jmp L$ocb_enc_loop4 - -.p2align 5 -L$ocb_enc_loop4: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$ocb_enc_loop4 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movups 16(%r11),%xmm1 - movq %r10,%rax - -.byte 102,65,15,56,221,210 -.byte 102,65,15,56,221,219 -.byte 102,65,15,56,221,228 -.byte 102,65,15,56,221,237 - .byte 0xf3,0xc3 - - - -.p2align 5 -__ocb_encrypt1: - pxor %xmm15,%xmm7 - pxor %xmm9,%xmm7 - pxor %xmm2,%xmm8 - pxor %xmm7,%xmm2 - movups 32(%r11),%xmm0 - -.byte 102,15,56,220,209 - movups 48(%r11),%xmm1 - pxor %xmm9,%xmm7 - -.byte 102,15,56,220,208 - movups 64(%r11),%xmm0 - jmp L$ocb_enc_loop1 - -.p2align 5 -L$ocb_enc_loop1: -.byte 102,15,56,220,209 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,220,208 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$ocb_enc_loop1 - -.byte 102,15,56,220,209 - movups 16(%r11),%xmm1 - movq %r10,%rax - -.byte 102,15,56,221,215 - .byte 0xf3,0xc3 - - -.globl _aes_hw_ocb_decrypt -.private_extern _aes_hw_ocb_decrypt - -.p2align 5 -_aes_hw_ocb_decrypt: - - leaq (%rsp),%rax - pushq %rbx - - pushq %rbp - - pushq %r12 - - pushq %r13 - - pushq %r14 - - movq 8(%rax),%rbx - movq 8+8(%rax),%rbp - - movl 240(%rcx),%r10d - movq %rcx,%r11 - shll $4,%r10d - movups (%rcx),%xmm9 - movups 16(%rcx,%r10,1),%xmm1 - - movdqu (%r9),%xmm15 - pxor %xmm1,%xmm9 - pxor %xmm1,%xmm15 - - movl $16+32,%eax - leaq 32(%r11,%r10,1),%rcx - movups 16(%r11),%xmm1 - subq %r10,%rax - movq %rax,%r10 - - movdqu (%rbx),%xmm10 - movdqu (%rbp),%xmm8 - - testq $1,%r8 - jnz L$ocb_dec_odd - - bsfq %r8,%r12 - addq $1,%r8 - shlq $4,%r12 - movdqu (%rbx,%r12,1),%xmm7 - movdqu (%rdi),%xmm2 - leaq 16(%rdi),%rdi - - call __ocb_decrypt1 - - movdqa %xmm7,%xmm15 - movups %xmm2,(%rsi) - xorps %xmm2,%xmm8 - leaq 16(%rsi),%rsi - subq $1,%rdx - jz L$ocb_dec_done - -L$ocb_dec_odd: - leaq 1(%r8),%r12 - leaq 3(%r8),%r13 - leaq 5(%r8),%r14 - leaq 6(%r8),%r8 - bsfq %r12,%r12 - bsfq %r13,%r13 - bsfq %r14,%r14 - shlq $4,%r12 - shlq $4,%r13 - shlq $4,%r14 - - subq $6,%rdx - jc L$ocb_dec_short - jmp L$ocb_dec_grandloop - -.p2align 5 -L$ocb_dec_grandloop: - movdqu 0(%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqu 32(%rdi),%xmm4 - movdqu 48(%rdi),%xmm5 - movdqu 64(%rdi),%xmm6 - movdqu 80(%rdi),%xmm7 - leaq 96(%rdi),%rdi - - call __ocb_decrypt6 - - movups %xmm2,0(%rsi) - pxor %xmm2,%xmm8 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm8 - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm8 - movups %xmm5,48(%rsi) - pxor %xmm5,%xmm8 - movups %xmm6,64(%rsi) - pxor %xmm6,%xmm8 - movups %xmm7,80(%rsi) - pxor %xmm7,%xmm8 - leaq 96(%rsi),%rsi - subq $6,%rdx - jnc L$ocb_dec_grandloop - -L$ocb_dec_short: - addq $6,%rdx - jz L$ocb_dec_done - - movdqu 0(%rdi),%xmm2 - cmpq $2,%rdx - jb L$ocb_dec_one - movdqu 16(%rdi),%xmm3 - je L$ocb_dec_two - - movdqu 32(%rdi),%xmm4 - cmpq $4,%rdx - jb L$ocb_dec_three - movdqu 48(%rdi),%xmm5 - je L$ocb_dec_four - - movdqu 64(%rdi),%xmm6 - pxor %xmm7,%xmm7 - - call __ocb_decrypt6 - - movdqa %xmm14,%xmm15 - movups %xmm2,0(%rsi) - pxor %xmm2,%xmm8 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm8 - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm8 - movups %xmm5,48(%rsi) - pxor %xmm5,%xmm8 - movups %xmm6,64(%rsi) - pxor %xmm6,%xmm8 - - jmp L$ocb_dec_done - 
-.p2align 4 -L$ocb_dec_one: - movdqa %xmm10,%xmm7 - - call __ocb_decrypt1 - - movdqa %xmm7,%xmm15 - movups %xmm2,0(%rsi) - xorps %xmm2,%xmm8 - jmp L$ocb_dec_done - -.p2align 4 -L$ocb_dec_two: - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - - call __ocb_decrypt4 - - movdqa %xmm11,%xmm15 - movups %xmm2,0(%rsi) - xorps %xmm2,%xmm8 - movups %xmm3,16(%rsi) - xorps %xmm3,%xmm8 - - jmp L$ocb_dec_done - -.p2align 4 -L$ocb_dec_three: - pxor %xmm5,%xmm5 - - call __ocb_decrypt4 - - movdqa %xmm12,%xmm15 - movups %xmm2,0(%rsi) - xorps %xmm2,%xmm8 - movups %xmm3,16(%rsi) - xorps %xmm3,%xmm8 - movups %xmm4,32(%rsi) - xorps %xmm4,%xmm8 - - jmp L$ocb_dec_done - -.p2align 4 -L$ocb_dec_four: - call __ocb_decrypt4 - - movdqa %xmm13,%xmm15 - movups %xmm2,0(%rsi) - pxor %xmm2,%xmm8 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm8 - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm8 - movups %xmm5,48(%rsi) - pxor %xmm5,%xmm8 - -L$ocb_dec_done: - pxor %xmm0,%xmm15 - movdqu %xmm8,(%rbp) - movdqu %xmm15,(%r9) - - xorps %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - pxor %xmm8,%xmm8 - pxor %xmm9,%xmm9 - pxor %xmm10,%xmm10 - pxor %xmm11,%xmm11 - pxor %xmm12,%xmm12 - pxor %xmm13,%xmm13 - pxor %xmm14,%xmm14 - pxor %xmm15,%xmm15 - leaq 40(%rsp),%rax - - movq -40(%rax),%r14 - - movq -32(%rax),%r13 - - movq -24(%rax),%r12 - - movq -16(%rax),%rbp - - movq -8(%rax),%rbx - - leaq (%rax),%rsp - -L$ocb_dec_epilogue: - .byte 0xf3,0xc3 - - - - -.p2align 5 -__ocb_decrypt6: - pxor %xmm9,%xmm15 - movdqu (%rbx,%r12,1),%xmm11 - movdqa %xmm10,%xmm12 - movdqu (%rbx,%r13,1),%xmm13 - movdqa %xmm10,%xmm14 - pxor %xmm15,%xmm10 - movdqu (%rbx,%r14,1),%xmm15 - pxor %xmm10,%xmm11 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm12 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm13 - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm14 - pxor %xmm13,%xmm5 - pxor %xmm14,%xmm15 - pxor %xmm14,%xmm6 - pxor %xmm15,%xmm7 - movups 32(%r11),%xmm0 - - leaq 1(%r8),%r12 - leaq 3(%r8),%r13 - leaq 5(%r8),%r14 - addq $6,%r8 - pxor %xmm9,%xmm10 - bsfq %r12,%r12 - bsfq %r13,%r13 - bsfq %r14,%r14 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - pxor %xmm9,%xmm11 - pxor %xmm9,%xmm12 -.byte 102,15,56,222,241 - pxor %xmm9,%xmm13 - pxor %xmm9,%xmm14 -.byte 102,15,56,222,249 - movups 48(%r11),%xmm1 - pxor %xmm9,%xmm15 - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 - movups 64(%r11),%xmm0 - shlq $4,%r12 - shlq $4,%r13 - jmp L$ocb_dec_loop6 - -.p2align 5 -L$ocb_dec_loop6: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$ocb_dec_loop6 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups 16(%r11),%xmm1 - shlq $4,%r14 - -.byte 102,65,15,56,223,210 - movdqu (%rbx),%xmm10 - movq %r10,%rax -.byte 102,65,15,56,223,219 -.byte 102,65,15,56,223,228 -.byte 102,65,15,56,223,237 -.byte 102,65,15,56,223,246 -.byte 102,65,15,56,223,255 - .byte 0xf3,0xc3 - - - -.p2align 5 -__ocb_decrypt4: - pxor %xmm9,%xmm15 - movdqu (%rbx,%r12,1),%xmm11 - movdqa 
%xmm10,%xmm12 - movdqu (%rbx,%r13,1),%xmm13 - pxor %xmm15,%xmm10 - pxor %xmm10,%xmm11 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm12 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm13 - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm5 - movups 32(%r11),%xmm0 - - pxor %xmm9,%xmm10 - pxor %xmm9,%xmm11 - pxor %xmm9,%xmm12 - pxor %xmm9,%xmm13 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - movups 48(%r11),%xmm1 - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 - movups 64(%r11),%xmm0 - jmp L$ocb_dec_loop4 - -.p2align 5 -L$ocb_dec_loop4: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$ocb_dec_loop4 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - movups 16(%r11),%xmm1 - movq %r10,%rax - -.byte 102,65,15,56,223,210 -.byte 102,65,15,56,223,219 -.byte 102,65,15,56,223,228 -.byte 102,65,15,56,223,237 - .byte 0xf3,0xc3 - - - -.p2align 5 -__ocb_decrypt1: - pxor %xmm15,%xmm7 - pxor %xmm9,%xmm7 - pxor %xmm7,%xmm2 - movups 32(%r11),%xmm0 - -.byte 102,15,56,222,209 - movups 48(%r11),%xmm1 - pxor %xmm9,%xmm7 - -.byte 102,15,56,222,208 - movups 64(%r11),%xmm0 - jmp L$ocb_dec_loop1 - -.p2align 5 -L$ocb_dec_loop1: -.byte 102,15,56,222,209 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,222,208 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$ocb_dec_loop1 - -.byte 102,15,56,222,209 - movups 16(%r11),%xmm1 - movq %r10,%rax - -.byte 102,15,56,223,215 - .byte 0xf3,0xc3 - -.globl _aes_hw_cbc_encrypt -.private_extern _aes_hw_cbc_encrypt - -.p2align 4 -_aes_hw_cbc_encrypt: - - testq %rdx,%rdx - jz L$cbc_ret - - movl 240(%rcx),%r10d - movq %rcx,%r11 - testl %r9d,%r9d - jz L$cbc_decrypt -#--------------------------- CBC ENCRYPT ------------------------------# - movups (%r8),%xmm2 - movl %r10d,%eax - cmpq $16,%rdx - jb L$cbc_enc_tail - subq $16,%rdx - jmp L$cbc_enc_loop -.p2align 4 -L$cbc_enc_loop: - movups (%rdi),%xmm3 - leaq 16(%rdi),%rdi -#xorps %xmm3,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - xorps %xmm0,%xmm3 - leaq 32(%rcx),%rcx - xorps %xmm3,%xmm2 -L$oop_enc1_15: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz L$oop_enc1_15 -.byte 102,15,56,221,209 - movl %r10d,%eax - movq %r11,%rcx - movups %xmm2,0(%rsi) - leaq 16(%rsi),%rsi - subq $16,%rdx - jnc L$cbc_enc_loop - addq $16,%rdx - jnz L$cbc_enc_tail - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - movups %xmm2,(%r8) - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - jmp L$cbc_ret - -L$cbc_enc_tail: - movq %rdx,%rcx - xchgq %rdi,%rsi -.long 0x9066A4F3 - movl $16,%ecx - subq %rdx,%rcx - xorl %eax,%eax -.long 0x9066AAF3 - leaq -16(%rdi),%rdi - movl %r10d,%eax - movq %rdi,%rsi - movq %r11,%rcx - xorq %rdx,%rdx - jmp L$cbc_enc_loop -#--------------------------- CBC DECRYPT ------------------------------# .p2align 4 L$cbc_decrypt: cmpq $16,%rdx @@ -3530,12 +1545,12 @@ L$cbc_decrypt: movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 -L$oop_dec1_16: +L$oop_dec1_7: .byte 102,15,56,222,209 decl %r10d movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz L$oop_dec1_16 + jnz L$oop_dec1_7 .byte 102,15,56,223,209 pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 @@ -3948,12 +1963,12 @@ L$cbc_dec_one: movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 -L$oop_dec1_17: 
+L$oop_dec1_8: .byte 102,15,56,222,209 decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz L$oop_dec1_17 + jnz L$oop_dec1_8 .byte 102,15,56,223,209 xorps %xmm10,%xmm2 movaps %xmm11,%xmm10 diff --git a/third_party/boringssl/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S b/third_party/boringssl/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S index baf70cb4163e65..5a65960d8a470b 100644 --- a/third_party/boringssl/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S +++ b/third_party/boringssl/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S @@ -219,7 +219,7 @@ L$enc_sbox: pxor %xmm13,%xmm8 pxor %xmm14,%xmm7 -#Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3 + @@ -690,7 +690,7 @@ L$dec_sbox: pxor %xmm13,%xmm8 pxor %xmm14,%xmm7 -#Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3 + @@ -1077,7 +1077,7 @@ L$key_loop: jnz L$key_loop movdqa 80(%r11),%xmm7 -#movdqa %xmm6, (%rax) + .byte 0xf3,0xc3 diff --git a/third_party/boringssl/mac-x86_64/crypto/fipsmodule/ghash-x86_64.S b/third_party/boringssl/mac-x86_64/crypto/fipsmodule/ghash-x86_64.S index 2bdeef2c0b2bbf..d7dcf5d61fddff 100644 --- a/third_party/boringssl/mac-x86_64/crypto/fipsmodule/ghash-x86_64.S +++ b/third_party/boringssl/mac-x86_64/crypto/fipsmodule/ghash-x86_64.S @@ -718,7 +718,7 @@ L$_init_clmul: pshufd $255,%xmm2,%xmm4 movdqa %xmm2,%xmm3 psllq $1,%xmm2 - pxor %xmm5,%xmm5# + pxor %xmm5,%xmm5 psrlq $63,%xmm3 pcmpgtd %xmm4,%xmm5 pslldq $8,%xmm3 @@ -732,43 +732,43 @@ L$_init_clmul: pshufd $78,%xmm2,%xmm6 movdqa %xmm2,%xmm0 pxor %xmm2,%xmm6 - movdqa %xmm0,%xmm1# + movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pxor %xmm0,%xmm3# + pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,222,0 - pxor %xmm0,%xmm3# - pxor %xmm1,%xmm3# + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 - movdqa %xmm3,%xmm4# + movdqa %xmm3,%xmm4 psrldq $8,%xmm3 - pslldq $8,%xmm4# + pslldq $8,%xmm4 pxor %xmm3,%xmm1 - pxor %xmm4,%xmm0# + pxor %xmm4,%xmm0 - movdqa %xmm0,%xmm4# + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 - pxor %xmm0,%xmm3# + pxor %xmm0,%xmm3 psllq $1,%xmm0 - pxor %xmm3,%xmm0# - psllq $57,%xmm0# - movdqa %xmm0,%xmm3# + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm3# + psrldq $8,%xmm3 pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1# + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm1# + pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 - pxor %xmm4,%xmm0# - psrlq $1,%xmm0# - pxor %xmm1,%xmm0# + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 pshufd $78,%xmm2,%xmm3 pshufd $78,%xmm0,%xmm4 pxor %xmm2,%xmm3 @@ -777,81 +777,81 @@ L$_init_clmul: movdqu %xmm0,16(%rdi) .byte 102,15,58,15,227,8 movdqu %xmm4,32(%rdi) - movdqa %xmm0,%xmm1# + movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pxor %xmm0,%xmm3# + pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,222,0 - pxor %xmm0,%xmm3# - pxor %xmm1,%xmm3# + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 - movdqa %xmm3,%xmm4# + movdqa %xmm3,%xmm4 psrldq $8,%xmm3 - pslldq $8,%xmm4# + pslldq $8,%xmm4 pxor %xmm3,%xmm1 - pxor %xmm4,%xmm0# + pxor %xmm4,%xmm0 - movdqa %xmm0,%xmm4# + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 - pxor %xmm0,%xmm3# + pxor %xmm0,%xmm3 psllq $1,%xmm0 - pxor %xmm3,%xmm0# - psllq $57,%xmm0# - movdqa %xmm0,%xmm3# + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm3# + psrldq $8,%xmm3 pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1# + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm1# + pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 - pxor %xmm4,%xmm0# - psrlq $1,%xmm0# - pxor 
%xmm1,%xmm0# + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 movdqa %xmm0,%xmm5 - movdqa %xmm0,%xmm1# + movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pxor %xmm0,%xmm3# + pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,222,0 - pxor %xmm0,%xmm3# - pxor %xmm1,%xmm3# + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 - movdqa %xmm3,%xmm4# + movdqa %xmm3,%xmm4 psrldq $8,%xmm3 - pslldq $8,%xmm4# + pslldq $8,%xmm4 pxor %xmm3,%xmm1 - pxor %xmm4,%xmm0# + pxor %xmm4,%xmm0 - movdqa %xmm0,%xmm4# + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 - pxor %xmm0,%xmm3# + pxor %xmm0,%xmm3 psllq $1,%xmm0 - pxor %xmm3,%xmm0# - psllq $57,%xmm0# - movdqa %xmm0,%xmm3# + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm3# + psrldq $8,%xmm3 pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1# + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm1# + pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 - pxor %xmm4,%xmm0# - psrlq $1,%xmm0# - pxor %xmm1,%xmm0# + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 pshufd $78,%xmm5,%xmm3 pshufd $78,%xmm0,%xmm4 pxor %xmm5,%xmm3 @@ -875,43 +875,43 @@ L$_gmult_clmul: movdqu (%rsi),%xmm2 movdqu 32(%rsi),%xmm4 .byte 102,15,56,0,197 - movdqa %xmm0,%xmm1# + movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pxor %xmm0,%xmm3# + pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,220,0 - pxor %xmm0,%xmm3# - pxor %xmm1,%xmm3# + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 - movdqa %xmm3,%xmm4# + movdqa %xmm3,%xmm4 psrldq $8,%xmm3 - pslldq $8,%xmm4# + pslldq $8,%xmm4 pxor %xmm3,%xmm1 - pxor %xmm4,%xmm0# + pxor %xmm4,%xmm0 - movdqa %xmm0,%xmm4# + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 - pxor %xmm0,%xmm3# + pxor %xmm0,%xmm3 psllq $1,%xmm0 - pxor %xmm3,%xmm0# - psllq $57,%xmm0# - movdqa %xmm0,%xmm3# + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm3# + psrldq $8,%xmm3 pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1# + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm1# + pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 - pxor %xmm4,%xmm0# - psrlq $1,%xmm0# - pxor %xmm1,%xmm0# + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 .byte 102,15,56,0,197 movdqu %xmm0,(%rdi) .byte 0xf3,0xc3 @@ -949,9 +949,9 @@ L$_ghash_clmul: movdqu 48(%rsi),%xmm14 movdqu 64(%rsi),%xmm15 -####### -# + + movdqu 48(%rdx),%xmm3 movdqu 32(%rdx),%xmm11 .byte 102,65,15,56,0,218 @@ -1018,28 +1018,28 @@ L$mod4_loop: pxor %xmm0,%xmm8 movdqa %xmm3,%xmm5 - pxor %xmm1,%xmm8# + pxor %xmm1,%xmm8 pxor %xmm3,%xmm4 - movdqa %xmm8,%xmm9# + movdqa %xmm8,%xmm9 .byte 102,68,15,58,68,234,17 pslldq $8,%xmm8 - psrldq $8,%xmm9# + psrldq $8,%xmm9 pxor %xmm8,%xmm0 movdqa L$7_mask(%rip),%xmm8 - pxor %xmm9,%xmm1# + pxor %xmm9,%xmm1 .byte 102,76,15,110,200 pand %xmm0,%xmm8 .byte 102,69,15,56,0,200 - pxor %xmm0,%xmm9# + pxor %xmm0,%xmm9 .byte 102,68,15,58,68,231,0 - psllq $57,%xmm9# - movdqa %xmm9,%xmm8# + psllq $57,%xmm9 + movdqa %xmm9,%xmm8 pslldq $8,%xmm9 .byte 102,15,58,68,222,0 - psrldq $8,%xmm8# + psrldq $8,%xmm8 pxor %xmm9,%xmm0 - pxor %xmm8,%xmm1# + pxor %xmm8,%xmm1 movdqu 0(%rdx),%xmm8 movdqa %xmm0,%xmm9 @@ -1052,19 +1052,19 @@ L$mod4_loop: xorps %xmm13,%xmm5 movups 80(%rsi),%xmm7 .byte 102,69,15,56,0,194 - pxor %xmm9,%xmm1# + pxor %xmm9,%xmm1 pxor %xmm0,%xmm9 psrlq $5,%xmm0 movdqa %xmm11,%xmm13 pxor %xmm12,%xmm4 pshufd $78,%xmm11,%xmm12 - pxor %xmm9,%xmm0# + pxor %xmm9,%xmm0 pxor %xmm8,%xmm1 pxor %xmm11,%xmm12 .byte 102,69,15,58,68,222,0 - psrlq $1,%xmm0# - pxor 
%xmm1,%xmm0# + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 movdqa %xmm0,%xmm1 .byte 102,69,15,58,68,238,17 xorps %xmm11,%xmm3 @@ -1088,48 +1088,48 @@ L$tail4x: pxor %xmm0,%xmm1 pxor %xmm4,%xmm8 - pxor %xmm1,%xmm8# + pxor %xmm1,%xmm8 pxor %xmm0,%xmm1 - movdqa %xmm8,%xmm9# + movdqa %xmm8,%xmm9 psrldq $8,%xmm8 - pslldq $8,%xmm9# + pslldq $8,%xmm9 pxor %xmm8,%xmm1 - pxor %xmm9,%xmm0# + pxor %xmm9,%xmm0 - movdqa %xmm0,%xmm4# + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 - pxor %xmm0,%xmm3# + pxor %xmm0,%xmm3 psllq $1,%xmm0 - pxor %xmm3,%xmm0# - psllq $57,%xmm0# - movdqa %xmm0,%xmm3# + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm3# + psrldq $8,%xmm3 pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1# + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm1# + pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 - pxor %xmm4,%xmm0# - psrlq $1,%xmm0# - pxor %xmm1,%xmm0# + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 addq $0x40,%rcx jz L$done movdqu 32(%rsi),%xmm7 subq $0x10,%rcx jz L$odd_tail L$skip4x: -####### -# [(H*Ii+1) + (H*Xi+1)] mod P = -# [(H*Ii+1) + H^2*(Ii+Xi)] mod P -# + + + + movdqu (%rdx),%xmm8 movdqu 16(%rdx),%xmm3 .byte 102,69,15,56,0,194 @@ -1154,8 +1154,8 @@ L$skip4x: L$mod_loop: movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm8 - pshufd $78,%xmm0,%xmm4# - pxor %xmm0,%xmm4# + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 .byte 102,15,58,68,198,0 .byte 102,15,58,68,206,17 @@ -1172,41 +1172,41 @@ L$mod_loop: pxor %xmm9,%xmm1 pxor %xmm8,%xmm4 .byte 102,65,15,56,0,218 - movdqa %xmm4,%xmm8# + movdqa %xmm4,%xmm8 psrldq $8,%xmm8 - pslldq $8,%xmm4# + pslldq $8,%xmm4 pxor %xmm8,%xmm1 - pxor %xmm4,%xmm0# + pxor %xmm4,%xmm0 - movdqa %xmm3,%xmm5# + movdqa %xmm3,%xmm5 movdqa %xmm0,%xmm9 movdqa %xmm0,%xmm8 psllq $5,%xmm0 - pxor %xmm0,%xmm8# + pxor %xmm0,%xmm8 .byte 102,15,58,68,218,0 psllq $1,%xmm0 - pxor %xmm8,%xmm0# - psllq $57,%xmm0# - movdqa %xmm0,%xmm8# + pxor %xmm8,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm8 pslldq $8,%xmm0 - psrldq $8,%xmm8# + psrldq $8,%xmm8 pxor %xmm9,%xmm0 pshufd $78,%xmm5,%xmm4 - pxor %xmm8,%xmm1# - pxor %xmm5,%xmm4# + pxor %xmm8,%xmm1 + pxor %xmm5,%xmm4 movdqa %xmm0,%xmm9 psrlq $1,%xmm0 .byte 102,15,58,68,234,17 - pxor %xmm9,%xmm1# + pxor %xmm9,%xmm1 pxor %xmm0,%xmm9 psrlq $5,%xmm0 - pxor %xmm9,%xmm0# + pxor %xmm9,%xmm0 leaq 32(%rdx),%rdx - psrlq $1,%xmm0# + psrlq $1,%xmm0 .byte 102,15,58,68,231,0 - pxor %xmm1,%xmm0# + pxor %xmm1,%xmm0 subq $0x20,%rcx ja L$mod_loop @@ -1214,8 +1214,8 @@ L$mod_loop: L$even_tail: movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm8 - pshufd $78,%xmm0,%xmm4# - pxor %xmm0,%xmm4# + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 .byte 102,15,58,68,198,0 .byte 102,15,58,68,206,17 @@ -1226,34 +1226,34 @@ L$even_tail: pxor %xmm0,%xmm8 pxor %xmm1,%xmm8 pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm8# + movdqa %xmm4,%xmm8 psrldq $8,%xmm8 - pslldq $8,%xmm4# + pslldq $8,%xmm4 pxor %xmm8,%xmm1 - pxor %xmm4,%xmm0# + pxor %xmm4,%xmm0 - movdqa %xmm0,%xmm4# + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 - pxor %xmm0,%xmm3# + pxor %xmm0,%xmm3 psllq $1,%xmm0 - pxor %xmm3,%xmm0# - psllq $57,%xmm0# - movdqa %xmm0,%xmm3# + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm3# + psrldq $8,%xmm3 pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1# + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm1# + pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 - pxor %xmm4,%xmm0# - psrlq $1,%xmm0# - pxor %xmm1,%xmm0# + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 testq %rcx,%rcx jnz L$done @@ -1261,43 +1261,43 
@@ L$odd_tail: movdqu (%rdx),%xmm8 .byte 102,69,15,56,0,194 pxor %xmm8,%xmm0 - movdqa %xmm0,%xmm1# + movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pxor %xmm0,%xmm3# + pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,223,0 - pxor %xmm0,%xmm3# - pxor %xmm1,%xmm3# + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 - movdqa %xmm3,%xmm4# + movdqa %xmm3,%xmm4 psrldq $8,%xmm3 - pslldq $8,%xmm4# + pslldq $8,%xmm4 pxor %xmm3,%xmm1 - pxor %xmm4,%xmm0# + pxor %xmm4,%xmm0 - movdqa %xmm0,%xmm4# + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 - pxor %xmm0,%xmm3# + pxor %xmm0,%xmm3 psllq $1,%xmm0 - pxor %xmm3,%xmm0# - psllq $57,%xmm0# - movdqa %xmm0,%xmm3# + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm3# + psrldq $8,%xmm3 pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1# + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm1# + pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 - pxor %xmm4,%xmm0# - psrlq $1,%xmm0# - pxor %xmm1,%xmm0# + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 L$done: .byte 102,65,15,56,0,194 movdqu %xmm0,(%rdi) @@ -1319,7 +1319,7 @@ _gcm_init_avx: vpshufd $255,%xmm2,%xmm4 vpsrlq $63,%xmm2,%xmm3 vpsllq $1,%xmm2,%xmm2 - vpxor %xmm5,%xmm5,%xmm5# + vpxor %xmm5,%xmm5,%xmm5 vpcmpgtd %xmm4,%xmm5,%xmm5 vpslldq $8,%xmm3,%xmm3 vpor %xmm3,%xmm2,%xmm2 @@ -1338,65 +1338,65 @@ L$init_loop_avx: vpalignr $8,%xmm3,%xmm4,%xmm5 vmovdqu %xmm5,-16(%rdi) vpunpckhqdq %xmm0,%xmm0,%xmm3 - vpxor %xmm0,%xmm3,%xmm3# - vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1####### - vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0####### - vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3####### - vpxor %xmm0,%xmm1,%xmm4# - vpxor %xmm4,%xmm3,%xmm3# - - vpslldq $8,%xmm3,%xmm4# + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 + vpxor %xmm0,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + + vpslldq $8,%xmm3,%xmm4 vpsrldq $8,%xmm3,%xmm3 - vpxor %xmm4,%xmm0,%xmm0# + vpxor %xmm4,%xmm0,%xmm0 vpxor %xmm3,%xmm1,%xmm1 vpsllq $57,%xmm0,%xmm3 vpsllq $62,%xmm0,%xmm4 - vpxor %xmm3,%xmm4,%xmm4# + vpxor %xmm3,%xmm4,%xmm4 vpsllq $63,%xmm0,%xmm3 - vpxor %xmm3,%xmm4,%xmm4# - vpslldq $8,%xmm4,%xmm3# + vpxor %xmm3,%xmm4,%xmm4 + vpslldq $8,%xmm4,%xmm3 vpsrldq $8,%xmm4,%xmm4 - vpxor %xmm3,%xmm0,%xmm0# + vpxor %xmm3,%xmm0,%xmm0 vpxor %xmm4,%xmm1,%xmm1 vpsrlq $1,%xmm0,%xmm4 vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm4,%xmm0,%xmm0# + vpxor %xmm4,%xmm0,%xmm0 vpsrlq $5,%xmm4,%xmm4 - vpxor %xmm4,%xmm0,%xmm0# - vpsrlq $1,%xmm0,%xmm0# - vpxor %xmm1,%xmm0,%xmm0# + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $1,%xmm0,%xmm0 + vpxor %xmm1,%xmm0,%xmm0 L$init_start_avx: vmovdqa %xmm0,%xmm5 vpunpckhqdq %xmm0,%xmm0,%xmm3 - vpxor %xmm0,%xmm3,%xmm3# - vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1####### - vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0####### - vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3####### - vpxor %xmm0,%xmm1,%xmm4# - vpxor %xmm4,%xmm3,%xmm3# - - vpslldq $8,%xmm3,%xmm4# + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 + vpxor %xmm0,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + + vpslldq $8,%xmm3,%xmm4 vpsrldq $8,%xmm3,%xmm3 - vpxor %xmm4,%xmm0,%xmm0# + vpxor %xmm4,%xmm0,%xmm0 vpxor %xmm3,%xmm1,%xmm1 vpsllq $57,%xmm0,%xmm3 vpsllq $62,%xmm0,%xmm4 - vpxor %xmm3,%xmm4,%xmm4# + vpxor %xmm3,%xmm4,%xmm4 vpsllq $63,%xmm0,%xmm3 - vpxor %xmm3,%xmm4,%xmm4# - vpslldq $8,%xmm4,%xmm3# + vpxor %xmm3,%xmm4,%xmm4 + vpslldq $8,%xmm4,%xmm3 vpsrldq $8,%xmm4,%xmm4 - vpxor %xmm3,%xmm0,%xmm0# + vpxor 
%xmm3,%xmm0,%xmm0 vpxor %xmm4,%xmm1,%xmm1 vpsrlq $1,%xmm0,%xmm4 vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm4,%xmm0,%xmm0# + vpxor %xmm4,%xmm0,%xmm0 vpsrlq $5,%xmm4,%xmm4 - vpxor %xmm4,%xmm0,%xmm0# - vpsrlq $1,%xmm0,%xmm0# - vpxor %xmm1,%xmm0,%xmm0# + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $1,%xmm0,%xmm0 + vpxor %xmm1,%xmm0,%xmm0 vpshufd $78,%xmm5,%xmm3 vpshufd $78,%xmm0,%xmm4 vpxor %xmm5,%xmm3,%xmm3 diff --git a/third_party/boringssl/mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S b/third_party/boringssl/mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S index 59d4596d6ca608..237c0a3279dd5d 100644 --- a/third_party/boringssl/mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S +++ b/third_party/boringssl/mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S @@ -34,7 +34,7 @@ L$ord: L$ordK: .quad 0xccd1c8aaee00bc4f -################################################################################ + .globl _ecp_nistz256_neg .private_extern _ecp_nistz256_neg @@ -90,7 +90,7 @@ L$neg_epilogue: .byte 0xf3,0xc3 -################################################################################ + @@ -126,7 +126,7 @@ L$ord_mul_body: leaq L$ord(%rip),%r14 movq L$ordK(%rip),%r15 -################################ + movq %rax,%rcx mulq 0(%rsi) movq %rax,%r8 @@ -154,7 +154,7 @@ L$ord_mul_body: adcq $0,%rdx movq %rdx,%r12 -################################ + mulq 0(%r14) movq %r8,%rbp addq %rax,%r13 @@ -184,7 +184,7 @@ L$ord_mul_body: adcq %rbp,%r12 adcq $0,%r13 -################################ + movq %rax,%rcx mulq 0(%rsi) addq %rax,%r9 @@ -220,7 +220,7 @@ L$ord_mul_body: adcq %rdx,%r13 adcq $0,%r8 -################################ + mulq 0(%r14) movq %r9,%rbp addq %rax,%rcx @@ -249,7 +249,7 @@ L$ord_mul_body: adcq %rbp,%r13 adcq $0,%r8 -################################# + movq %rax,%rcx mulq 0(%rsi) addq %rax,%r10 @@ -285,7 +285,7 @@ L$ord_mul_body: adcq %rdx,%r8 adcq $0,%r9 -################################ + mulq 0(%r14) movq %r10,%rbp addq %rax,%rcx @@ -314,7 +314,7 @@ L$ord_mul_body: adcq %rbp,%r8 adcq $0,%r9 -################################ + movq %rax,%rcx mulq 0(%rsi) addq %rax,%r11 @@ -350,7 +350,7 @@ L$ord_mul_body: adcq %rdx,%r9 adcq $0,%r10 -################################ + mulq 0(%r14) movq %r11,%rbp addq %rax,%rcx @@ -378,7 +378,7 @@ L$ord_mul_body: adcq %rbp,%r9 adcq $0,%r10 -################################ + movq %r12,%rsi subq 0(%r14),%r12 movq %r13,%r11 @@ -418,7 +418,7 @@ L$ord_mul_epilogue: -################################################################################ + @@ -459,7 +459,7 @@ L$ord_sqr_body: .p2align 5 L$oop_ord_sqr: -################################ + movq %rax,%rbp mulq %r8 movq %rax,%r9 @@ -481,13 +481,13 @@ L$oop_ord_sqr: adcq $0,%rdx movq %rdx,%r12 -################################ + mulq %r14 movq %rax,%r13 movq %r14,%rax movq %rdx,%r14 -################################ + mulq %rbp addq %rax,%r11 movq %r15,%rax @@ -502,7 +502,7 @@ L$oop_ord_sqr: adcq %rdx,%r13 adcq $0,%r14 -################################ + xorq %r15,%r15 movq %r8,%rax addq %r9,%r9 @@ -513,7 +513,7 @@ L$oop_ord_sqr: adcq %r14,%r14 adcq $0,%r15 -################################ + mulq %rax movq %rax,%r8 .byte 102,72,15,126,200 @@ -542,7 +542,7 @@ L$oop_ord_sqr: movq 0(%rsi),%rax adcq %rdx,%r15 -################################ + mulq %r8 movq %r8,%rbp addq %rax,%rcx @@ -573,7 +573,7 @@ L$oop_ord_sqr: addq %rbp,%r11 adcq $0,%r8 -################################ + mulq %r9 movq %r9,%rbp addq %rax,%rcx @@ -604,7 +604,7 @@ L$oop_ord_sqr: addq %rbp,%r8 adcq $0,%r9 -################################ + mulq %r10 movq %r10,%rbp addq 
%rax,%rcx @@ -635,7 +635,7 @@ L$oop_ord_sqr: addq %rbp,%r9 adcq $0,%r10 -################################ + mulq %r11 movq %r11,%rbp addq %rax,%rcx @@ -662,7 +662,7 @@ L$oop_ord_sqr: addq %rbp,%r10 adcq $0,%r11 -################################ + xorq %rdx,%rdx addq %r12,%r8 adcq %r13,%r9 @@ -672,7 +672,7 @@ L$oop_ord_sqr: movq %r9,%rax adcq $0,%rdx -################################ + subq 0(%rsi),%r8 movq %r10,%r14 sbbq 8(%rsi),%r9 @@ -715,7 +715,7 @@ L$ord_sqr_epilogue: .byte 0xf3,0xc3 -################################################################################ + .p2align 5 ecp_nistz256_ord_mul_montx: @@ -745,7 +745,7 @@ L$ord_mulx_body: leaq L$ord-128(%rip),%r14 movq L$ordK(%rip),%r15 -################################ + mulxq %r9,%r8,%r9 mulxq %r10,%rcx,%r10 mulxq %r11,%rbp,%r11 @@ -757,7 +757,7 @@ L$ord_mulx_body: adcq %rcx,%r11 adcq $0,%r12 -################################ + xorq %r13,%r13 mulxq 0+128(%r14),%rcx,%rbp adcxq %rcx,%r8 @@ -779,7 +779,7 @@ L$ord_mulx_body: adoxq %r8,%r13 adcq $0,%r13 -################################ + mulxq 0+128(%rsi),%rcx,%rbp adcxq %rcx,%r9 adoxq %rbp,%r10 @@ -802,7 +802,7 @@ L$ord_mulx_body: adoxq %r8,%r8 adcq $0,%r8 -################################ + mulxq 0+128(%r14),%rcx,%rbp adcxq %rcx,%r9 adoxq %rbp,%r10 @@ -823,7 +823,7 @@ L$ord_mulx_body: adoxq %r9,%r8 adcq $0,%r8 -################################ + mulxq 0+128(%rsi),%rcx,%rbp adcxq %rcx,%r10 adoxq %rbp,%r11 @@ -846,7 +846,7 @@ L$ord_mulx_body: adoxq %r9,%r9 adcq $0,%r9 -################################ + mulxq 0+128(%r14),%rcx,%rbp adcxq %rcx,%r10 adoxq %rbp,%r11 @@ -867,7 +867,7 @@ L$ord_mulx_body: adoxq %r10,%r9 adcq $0,%r9 -################################ + mulxq 0+128(%rsi),%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 @@ -890,7 +890,7 @@ L$ord_mulx_body: adoxq %r10,%r10 adcq $0,%r10 -################################ + mulxq 0+128(%r14),%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 @@ -913,7 +913,7 @@ L$ord_mulx_body: adoxq %r11,%r10 adcq $0,%r10 -################################# + movq %r8,%rcx subq 0(%r14),%r12 @@ -992,7 +992,7 @@ L$oop_ord_sqrx: adcq %rbp,%r11 adcq $0,%r12 xorq %r13,%r13 -################################# + mulxq %r15,%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 @@ -1002,7 +1002,7 @@ L$oop_ord_sqrx: adcxq %rcx,%r12 adoxq %rbp,%r13 adcq $0,%r13 -################################# + mulxq %r8,%rcx,%r14 movq %rax,%rdx .byte 102,73,15,110,216 @@ -1012,7 +1012,7 @@ L$oop_ord_sqrx: adcxq %r10,%r10 adoxq %r15,%r14 -################################ + mulxq %rdx,%r8,%rbp .byte 102,72,15,126,202 adcxq %r11,%r11 @@ -1034,7 +1034,7 @@ L$oop_ord_sqrx: adoxq %rcx,%r14 adoxq %rax,%r15 -################################ + movq %r8,%rdx mulxq 32(%rsi),%rdx,%rcx @@ -1053,7 +1053,7 @@ L$oop_ord_sqrx: adoxq %rbp,%r8 adcxq %rax,%r8 -################################# + movq %r9,%rdx mulxq 32(%rsi),%rdx,%rcx @@ -1071,7 +1071,7 @@ L$oop_ord_sqrx: adcxq %rbp,%r9 adoxq %rax,%r9 -################################# + movq %r10,%rdx mulxq 32(%rsi),%rdx,%rcx @@ -1089,7 +1089,7 @@ L$oop_ord_sqrx: adoxq %rbp,%r10 adcxq %rax,%r10 -################################# + movq %r11,%rdx mulxq 32(%rsi),%rdx,%rcx @@ -1107,7 +1107,7 @@ L$oop_ord_sqrx: adcxq %rbp,%r11 adoxq %rax,%r11 -################################ + addq %r8,%r12 adcq %r13,%r9 movq %r12,%rdx @@ -1116,7 +1116,7 @@ L$oop_ord_sqrx: movq %r9,%r14 adcq $0,%rax -################################ + subq 0(%rsi),%r12 movq %r10,%r15 sbbq 8(%rsi),%r9 @@ -1159,7 +1159,7 @@ L$ord_sqrx_epilogue: .byte 0xf3,0xc3 
-################################################################################ + @@ -1235,7 +1235,7 @@ L$mul_epilogue: .p2align 5 __ecp_nistz256_mul_montq: -######################################################################## + movq %rax,%rbp mulq %r9 @@ -1264,12 +1264,12 @@ __ecp_nistz256_mul_montq: xorq %r13,%r13 movq %rdx,%r12 -######################################################################## -# + + @@ -1285,7 +1285,7 @@ __ecp_nistz256_mul_montq: adcq $0,%r13 xorq %r8,%r8 -######################################################################## + movq %rax,%rbp mulq 0(%rsi) @@ -1318,7 +1318,7 @@ __ecp_nistz256_mul_montq: adcq %rdx,%r13 adcq $0,%r8 -######################################################################## + movq %r9,%rbp shlq $32,%r9 @@ -1332,7 +1332,7 @@ __ecp_nistz256_mul_montq: adcq $0,%r8 xorq %r9,%r9 -######################################################################## + movq %rax,%rbp mulq 0(%rsi) @@ -1365,7 +1365,7 @@ __ecp_nistz256_mul_montq: adcq %rdx,%r8 adcq $0,%r9 -######################################################################## + movq %r10,%rbp shlq $32,%r10 @@ -1379,7 +1379,7 @@ __ecp_nistz256_mul_montq: adcq $0,%r9 xorq %r10,%r10 -######################################################################## + movq %rax,%rbp mulq 0(%rsi) @@ -1412,7 +1412,7 @@ __ecp_nistz256_mul_montq: adcq %rdx,%r9 adcq $0,%r10 -######################################################################## + movq %r11,%rbp shlq $32,%r11 @@ -1426,7 +1426,7 @@ __ecp_nistz256_mul_montq: movq %r13,%rbp adcq $0,%r10 -######################################################################## + subq $-1,%r12 movq %r8,%rbx @@ -1449,7 +1449,7 @@ __ecp_nistz256_mul_montq: -################################################################################ + @@ -1539,7 +1539,7 @@ __ecp_nistz256_sqr_montq: adcq $0,%rdx movq %rdx,%r12 -################################# + mulq %r14 addq %rax,%r11 movq %r8,%rax @@ -1554,7 +1554,7 @@ __ecp_nistz256_sqr_montq: movq %rdx,%r13 adcq $0,%r13 -################################# + mulq %r15 xorq %r15,%r15 addq %rax,%r13 @@ -1598,7 +1598,7 @@ __ecp_nistz256_sqr_montq: movq L$poly+8(%rip),%rsi movq L$poly+24(%rip),%rbp -########################################## + movq %r8,%rcx @@ -1611,7 +1611,7 @@ __ecp_nistz256_sqr_montq: movq %r9,%rax adcq $0,%rdx -########################################## + movq %r9,%rcx shlq $32,%r9 @@ -1624,7 +1624,7 @@ __ecp_nistz256_sqr_montq: movq %r10,%rax adcq $0,%rdx -########################################## + movq %r10,%rcx shlq $32,%r10 @@ -1637,7 +1637,7 @@ __ecp_nistz256_sqr_montq: movq %r11,%rax adcq $0,%rdx -########################################### + movq %r11,%rcx shlq $32,%r11 @@ -1650,7 +1650,7 @@ __ecp_nistz256_sqr_montq: adcq $0,%rdx xorq %r11,%r11 -############################################ + addq %r8,%r12 adcq %r9,%r13 @@ -1684,7 +1684,7 @@ __ecp_nistz256_sqr_montq: .p2align 5 __ecp_nistz256_mul_montx: -######################################################################## + mulxq %r9,%r8,%r9 mulxq %r10,%rcx,%r10 @@ -1701,7 +1701,7 @@ __ecp_nistz256_mul_montx: shrxq %r14,%r8,%rcx adcq $0,%r12 -######################################################################## + addq %rbp,%r9 adcq %rcx,%r10 @@ -1713,7 +1713,7 @@ __ecp_nistz256_mul_montx: adcq $0,%r13 xorq %r8,%r8 -######################################################################## + mulxq 0+128(%rsi),%rcx,%rbp adcxq %rcx,%r9 @@ -1738,7 +1738,7 @@ __ecp_nistz256_mul_montx: adoxq %r8,%r8 adcq $0,%r8 
-######################################################################## + addq %rcx,%r10 adcq %rbp,%r11 @@ -1750,7 +1750,7 @@ __ecp_nistz256_mul_montx: adcq $0,%r8 xorq %r9,%r9 -######################################################################## + mulxq 0+128(%rsi),%rcx,%rbp adcxq %rcx,%r10 @@ -1775,7 +1775,7 @@ __ecp_nistz256_mul_montx: adoxq %r9,%r9 adcq $0,%r9 -######################################################################## + addq %rcx,%r11 adcq %rbp,%r12 @@ -1787,7 +1787,7 @@ __ecp_nistz256_mul_montx: adcq $0,%r9 xorq %r10,%r10 -######################################################################## + mulxq 0+128(%rsi),%rcx,%rbp adcxq %rcx,%r11 @@ -1812,7 +1812,7 @@ __ecp_nistz256_mul_montx: adoxq %r10,%r10 adcq $0,%r10 -######################################################################## + addq %rcx,%r12 adcq %rbp,%r13 @@ -1825,7 +1825,7 @@ __ecp_nistz256_mul_montx: adcq %rbp,%r9 adcq $0,%r10 -######################################################################## + xorl %eax,%eax movq %r8,%rcx @@ -1863,7 +1863,7 @@ __ecp_nistz256_sqr_montx: adcq $0,%r12 xorq %r13,%r13 -################################# + mulxq %r15,%rcx,%rbp adcxq %rcx,%r11 adoxq %rbp,%r12 @@ -1874,7 +1874,7 @@ __ecp_nistz256_sqr_montx: adoxq %rbp,%r13 adcq $0,%r13 -################################# + mulxq %r8,%rcx,%r14 movq 0+128(%rsi),%rdx xorq %r15,%r15 @@ -1978,7 +1978,7 @@ __ecp_nistz256_sqr_montx: .byte 0xf3,0xc3 -################################################################################ + .globl _ecp_nistz256_select_w5 .private_extern _ecp_nistz256_select_w5 @@ -2045,7 +2045,7 @@ L$select_loop_sse_w5: L$SEH_end_ecp_nistz256_select_w5: -################################################################################ + .globl _ecp_nistz256_select_w7 .private_extern _ecp_nistz256_select_w7 @@ -2100,7 +2100,7 @@ L$select_loop_sse_w7: L$SEH_end_ecp_nistz256_select_w7: -################################################################################ + .p2align 5 @@ -2164,7 +2164,7 @@ L$select_loop_avx2_w5: L$SEH_end_ecp_nistz256_avx2_select_w5: -################################################################################ + .globl _ecp_nistz256_avx2_select_w7 .private_extern _ecp_nistz256_avx2_select_w7 @@ -2857,9 +2857,9 @@ L$add_proceedq: movq 24+32(%rsp),%r12 leaq 192(%rsp),%rdi call __ecp_nistz256_mul_montq -#lea 192(%rsp), %rsi -#lea 32(%rsp), %rdi -#call __ecp_nistz256_mul_by_2 + + + xorq %r11,%r11 addq %r12,%r12 @@ -3096,7 +3096,7 @@ L$add_affineq_body: pcmpeqd %xmm4,%xmm5 pshufd $0xb1,%xmm3,%xmm4 movq 0(%rbx),%rax -#lea 0x00(%rbx), %rbx + movq %r12,%r9 por %xmm3,%xmm4 pshufd $0,%xmm5,%xmm5 @@ -3186,9 +3186,9 @@ L$add_affineq_body: movq 24+128(%rsp),%r12 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_montq -#lea 0(%rsp), %rsi -#lea 128(%rsp), %rdi -#call __ecp_nistz256_mul_by_2 + + + xorq %r11,%r11 addq %r12,%r12 @@ -3960,9 +3960,9 @@ L$add_proceedx: movq 24+32(%rsp),%r12 leaq 192(%rsp),%rdi call __ecp_nistz256_mul_montx -#lea 192(%rsp), %rsi -#lea 32(%rsp), %rdi -#call __ecp_nistz256_mul_by_2 + + + xorq %r11,%r11 addq %r12,%r12 @@ -4193,7 +4193,7 @@ L$add_affinex_body: pcmpeqd %xmm4,%xmm5 pshufd $0xb1,%xmm3,%xmm4 movq 0(%rbx),%rdx -#lea 0x00(%rbx), %rbx + movq %r12,%r9 por %xmm3,%xmm4 pshufd $0,%xmm5,%xmm5 @@ -4283,9 +4283,9 @@ L$add_affinex_body: movq 24+128(%rsp),%r12 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_montx -#lea 0(%rsp), %rsi -#lea 128(%rsp), %rdi -#call __ecp_nistz256_mul_by_2 + + + xorq %r11,%r11 addq %r12,%r12 diff --git 
a/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S b/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S index e65b5d64509e9c..f6f2be7ae1cf25 100644 --- a/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S +++ b/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S @@ -23,12 +23,10 @@ _CRYPTO_rdrand: xorq %rax,%rax - - -.byte 0x48, 0x0f, 0xc7, 0xf1 +.byte 72,15,199,242 adcq %rax,%rax - movq %rcx,0(%rdi) + movq %rdx,0(%rdi) .byte 0xf3,0xc3 @@ -36,6 +34,7 @@ _CRYPTO_rdrand: + .globl _CRYPTO_rdrand_multiple8_buf .private_extern _CRYPTO_rdrand_multiple8_buf @@ -46,9 +45,7 @@ _CRYPTO_rdrand_multiple8_buf: jz L$out movq $8,%rdx L$loop: - - -.byte 0x48, 0x0f, 0xc7, 0xf1 +.byte 72,15,199,241 jnc L$err movq %rcx,0(%rdi) addq %rdx,%rdi @@ -61,4 +58,5 @@ L$err: xorq %rax,%rax .byte 0xf3,0xc3 + #endif diff --git a/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rsaz-avx2.S b/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rsaz-avx2.S index b703ae913c8ea6..e9cae78c5dd25f 100644 --- a/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rsaz-avx2.S +++ b/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rsaz-avx2.S @@ -382,9 +382,9 @@ L$OOP_REDUCE_1024: vpaddq %ymm10,%ymm7,%ymm7 vpmuludq 256-128(%r13),%ymm12,%ymm14 vmovd %eax,%xmm12 -#vmovdqu 32*1-8-128(%r13), %ymm11 + vpaddq %ymm14,%ymm8,%ymm8 -#vmovdqu 32*2-8-128(%r13), %ymm10 + vpbroadcastq %xmm12,%ymm12 vpmuludq 32-8-128(%r13),%ymm13,%ymm11 @@ -460,7 +460,7 @@ L$OOP_REDUCE_1024: addq %r12,%rax vpaddq %ymm14,%ymm7,%ymm7 vpmuludq %ymm12,%ymm11,%ymm11 -#vmovdqu 32*2-24-128(%r13), %ymm14 + movq %rax,%r9 imull %ecx,%eax vpaddq %ymm11,%ymm8,%ymm8 diff --git a/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha256-x86_64.S b/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha256-x86_64.S index 626c7df6688dd9..5e46e81c166382 100644 --- a/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha256-x86_64.S +++ b/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha256-x86_64.S @@ -1818,8 +1818,8 @@ L$prologue_ssse3: movl 20(%rdi),%r9d movl 24(%rdi),%r10d movl 28(%rdi),%r11d -#movdqa K256+512+32(%rip),%xmm8 -#movdqa K256+512+64(%rip),%xmm9 + + jmp L$loop_ssse3 .p2align 4 L$loop_ssse3: diff --git a/third_party/boringssl/mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S b/third_party/boringssl/mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S index 5d35ee0463a44a..2733c07e660ab2 100644 --- a/third_party/boringssl/mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S +++ b/third_party/boringssl/mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S @@ -13,21 +13,21 @@ #endif .text -## -# -## -# -## -# -# -# -# -## -# -# -# -## -## + + + + + + + + + + + + + + + .p2align 4 _vpaes_encrypt_core: @@ -115,11 +115,11 @@ L$enc_entry: -## -# -## -# -## + + + + + .p2align 4 _vpaes_decrypt_core: @@ -149,9 +149,9 @@ _vpaes_decrypt_core: .p2align 4 L$dec_loop: -## -# -## + + + movdqa -32(%r10),%xmm4 movdqa -16(%r10),%xmm1 .byte 102,15,56,0,226 @@ -223,11 +223,11 @@ L$dec_entry: -######################################################## -# -# -# -######################################################## + + + + + .p2align 4 _vpaes_schedule_core: @@ -268,14 +268,14 @@ L$schedule_go: je L$schedule_192 -## -# -## -# -## -# -# -## + + + + + + + + L$schedule_128: movl $10,%esi @@ -286,21 +286,21 @@ L$oop_schedule_128: call _vpaes_schedule_mangle jmp L$oop_schedule_128 -## -# -## -# -## -# -# -# -# -## -# -# -# -# -## + + + + + + + + + + + + + + + .p2align 4 L$schedule_192: movdqu 8(%rdi),%xmm0 @@ -323,16 +323,16 @@ L$oop_schedule_192: call 
_vpaes_schedule_192_smear jmp L$oop_schedule_192 -## -# -## -# -## -# -# -# -# -## + + + + + + + + + + .p2align 4 L$schedule_256: movdqu 16(%rdi),%xmm0 @@ -359,16 +359,16 @@ L$oop_schedule_256: jmp L$oop_schedule_256 -## -# -## -# -# -# -# -## -# -## + + + + + + + + + + .p2align 4 L$schedule_mangle_last: @@ -401,20 +401,20 @@ L$schedule_mangle_last_dec: -## -# -## -# -## -# -# -# -# -## -# -# -# -## + + + + + + + + + + + + + + .p2align 4 _vpaes_schedule_192_smear: @@ -430,24 +430,24 @@ _vpaes_schedule_192_smear: -## -# -## -# -## -# -# -# -## -# -# -## -# -# -## -# -# -## + + + + + + + + + + + + + + + + + + .p2align 4 _vpaes_schedule_round: @@ -508,15 +508,15 @@ _vpaes_schedule_low_round: -## -# -## -# -## -# -# -# -## + + + + + + + + + .p2align 4 _vpaes_schedule_transform: @@ -534,29 +534,29 @@ _vpaes_schedule_transform: -## -# -## -# -# -## -# -# -# -# -## -# -# -# -# -# -## -## -# -# -# -# -## + + + + + + + + + + + + + + + + + + + + + + + .p2align 4 _vpaes_schedule_mangle: @@ -628,9 +628,9 @@ L$schedule_mangle_both: -# -# + + .globl _vpaes_set_encrypt_key .private_extern _vpaes_set_encrypt_key @@ -744,12 +744,12 @@ L$cbc_abort: .byte 0xf3,0xc3 -## -# -## -# -# -## + + + + + + .p2align 4 _vpaes_preheat: @@ -765,11 +765,11 @@ _vpaes_preheat: .byte 0xf3,0xc3 -######################################################## -# -# -# -######################################################## + + + + + .p2align 6 _vpaes_consts: @@ -826,10 +826,10 @@ L$k_deskew: .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 -## -# -# -## + + + + L$k_dksd: .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E @@ -843,10 +843,10 @@ L$k_dks9: .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE -## -# -# -## + + + + L$k_dipt: .quad 0x0F505B040B545F00, 0x154A411E114E451A .quad 0x86E383E660056500, 0x12771772F491F194 diff --git a/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont.S b/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont.S index a7b8d9eee732b0..8d6444cb6f6226 100644 --- a/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont.S +++ b/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont.S @@ -58,7 +58,7 @@ L$mul_enter: andq $-1024,%r10 -# + @@ -733,11 +733,11 @@ L$sqr8x_prologue: shlq $3+2,%r10 negq %r9 -############################################################## -# + + leaq -64(%rsp,%r9,2),%r11 movq %rsp,%rbp movq (%r8),%r8 @@ -940,7 +940,6 @@ L$mulx4x_page_walk: L$mulx4x_page_walk_done: leaq (%rdx,%r9,1),%r10 -############################################################## @@ -951,7 +950,8 @@ L$mulx4x_page_walk_done: -# + + movq %r9,0(%rsp) shrq $5,%r9 movq %r10,16(%rsp) diff --git a/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont5.S b/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont5.S index af2c250a71cf86..4bd36feae43b04 100644 --- a/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont5.S +++ b/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont5.S @@ -53,7 +53,7 @@ L$mul_enter: andq $-1024,%r10 -# + @@ -486,7 +486,6 @@ L$mul4x_prologue: leaq (%r9,%r9,2),%r10 negq %r9 -############################################################## @@ -494,7 +493,8 @@ L$mul4x_prologue: -# + + leaq -320(%rsp,%r9,2),%r11 movq %rsp,%rbp subq %rdi,%r11 @@ -565,6 +565,7 @@ L$mul4x_epilogue: .p2align 5 mul4x_internal: + shlq $5,%r9 movd 8(%rax),%xmm5 leaq L$inc(%rip),%rax @@ -1087,6 +1088,7 @@ L$inner4x: movq 24(%rbp),%r15 
jmp L$sqr4x_sub_entry + .globl _bn_power5 .private_extern _bn_power5 @@ -1119,13 +1121,13 @@ L$power5_prologue: negq %r9 movq (%r8),%r8 -############################################################## -# + + leaq -320(%rsp,%r9,2),%r11 movq %rsp,%rbp subq %rdi,%r11 @@ -1165,15 +1167,15 @@ L$pwr_page_walk_done: movq %r9,%r10 negq %r9 -############################################################## -# -# + + + movq %r8,32(%rsp) movq %rax,40(%rsp) @@ -1231,14 +1233,15 @@ L$power5_epilogue: .p2align 5 _bn_sqr8x_internal: __bn_sqr8x_internal: -############################################################## -# -# -############################################################## + + + + + @@ -2007,8 +2010,10 @@ L$8x_no_tail: .byte 0xf3,0xc3 + .p2align 5 __bn_post4x_internal: + movq 0(%rbp),%r12 leaq (%rdi,%r9,1),%rbx movq %r9,%rcx @@ -2060,11 +2065,13 @@ L$sqr4x_sub_entry: negq %r9 .byte 0xf3,0xc3 + .globl _bn_from_montgomery .private_extern _bn_from_montgomery .p2align 5 _bn_from_montgomery: + testl $7,%r9d jz bn_from_mont8x xorl %eax,%eax @@ -2072,6 +2079,7 @@ _bn_from_montgomery: + .p2align 5 bn_from_mont8x: @@ -2097,13 +2105,13 @@ L$from_prologue: negq %r9 movq (%r8),%r8 -############################################################## -# + + leaq -320(%rsp,%r9,2),%r11 movq %rsp,%rbp subq %rdi,%r11 @@ -2143,15 +2151,15 @@ L$from_page_walk_done: movq %r9,%r10 negq %r9 -############################################################## -# -# + + + movq %r8,32(%rsp) movq %rax,40(%rsp) @@ -2265,7 +2273,6 @@ L$mulx4x_prologue: negq %r9 movq (%r8),%r8 -############################################################## @@ -2273,7 +2280,8 @@ L$mulx4x_prologue: -# + + leaq -320(%rsp,%r9,2),%r11 movq %rsp,%rbp subq %rdi,%r11 @@ -2309,7 +2317,6 @@ L$mulx4x_page_walk: ja L$mulx4x_page_walk L$mulx4x_page_walk_done: -############################################################## @@ -2320,7 +2327,8 @@ L$mulx4x_page_walk_done: -# + + movq %r8,32(%rsp) movq %rax,40(%rsp) @@ -2353,6 +2361,7 @@ L$mulx4x_epilogue: .p2align 5 mulx4x_internal: + movq %r9,8(%rsp) movq %r9,%r10 negq %r9 @@ -2773,6 +2782,7 @@ L$mulx4x_inner: jmp L$sqrx4x_sub_entry + .p2align 5 bn_powerx5: @@ -2798,13 +2808,13 @@ L$powerx5_prologue: negq %r9 movq (%r8),%r8 -############################################################## -# + + leaq -320(%rsp,%r9,2),%r11 movq %rsp,%rbp subq %rdi,%r11 @@ -2844,9 +2854,7 @@ L$pwrx_page_walk_done: movq %r9,%r10 negq %r9 -############################################################## -# @@ -2854,7 +2862,9 @@ L$pwrx_page_walk_done: -# + + + pxor %xmm0,%xmm0 .byte 102,72,15,110,207 .byte 102,72,15,110,209 @@ -2915,14 +2925,10 @@ L$powerx5_epilogue: _bn_sqrx8x_internal: __bn_sqrx8x_internal: -################################################################## -# -# -################################################################## @@ -2930,7 +2936,6 @@ __bn_sqrx8x_internal: -# @@ -2947,7 +2952,12 @@ __bn_sqrx8x_internal: -# + + + + + + @@ -2979,7 +2989,7 @@ L$sqr8x_zero_start: jnz L$sqrx8x_zero movq 0(%rsi),%rdx -#xor %r9,%r9 + xorq %r10,%r10 xorq %r11,%r11 xorq %r12,%r12 @@ -3096,7 +3106,7 @@ L$sqrx8x_outer_loop: movq %r14,%rdx adoxq %rbx,%r11 adcxq %r12,%r11 -#adox %rbp,%rax + adcxq %rbp,%rax mulxq %r15,%r14,%rbx @@ -3135,7 +3145,7 @@ L$sqrx8x_outer_loop: movq %rax,16+8(%rsp) movq %rdi,24+8(%rsp) -#lea 8*8(%rdi),%rdi + xorl %eax,%eax jmp L$sqrx8x_loop @@ -3258,7 +3268,7 @@ L$sqrx8x_outer_break: adoxq %r11,%r11 movq 16(%rdi),%r12 movq 24(%rdi),%r13 -#jmp .Lsqrx4x_shift_n_add + .p2align 5 L$sqrx4x_shift_n_add: @@ -3323,7 
+3333,7 @@ __bn_sqrx8x_reduction: movq 32+8(%rsp),%rbx movq 48+8(%rsp),%rdx leaq -64(%rbp,%r9,1),%rcx -#lea 48+8(%rsp,%r9,2),%rdi + movq %rcx,0+8(%rsp) movq %rdi,8+8(%rsp) @@ -3529,13 +3539,15 @@ L$sqrx8x_no_tail: .p2align 5 + __bn_postx4x_internal: + movq 0(%rbp),%r12 movq %rcx,%r10 movq %rcx,%r9 negq %rax sarq $3+2,%rcx -#lea 48+8(%rsp,%r9),%rdi + .byte 102,72,15,126,202 .byte 102,72,15,126,206 decq %r12 @@ -3578,11 +3590,13 @@ L$sqrx4x_sub_entry: .byte 0xf3,0xc3 + .globl _bn_scatter5 .private_extern _bn_scatter5 .p2align 4 _bn_scatter5: + cmpl $0,%esi jz L$scatter_epilogue leaq (%rdx,%rcx,8),%rdx @@ -3597,15 +3611,18 @@ L$scatter_epilogue: .byte 0xf3,0xc3 + .globl _bn_gather5 .private_extern _bn_gather5 .p2align 5 _bn_gather5: + L$SEH_begin_bn_gather5: -.byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10 -.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub src/crypto/fipsmodule/bn/asm/x86_64-mont5.plx108,%rsp +.byte 0x4c,0x8d,0x14,0x24 + +.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 leaq L$inc(%rip),%rax andq $-16,%rsp @@ -3758,9 +3775,11 @@ L$gather: jnz L$gather leaq (%r10),%rsp + .byte 0xf3,0xc3 L$SEH_end_bn_gather5: + .p2align 6 L$inc: .long 0,0, 1,1 diff --git a/third_party/boringssl/win-x86_64/crypto/fipsmodule/aesni-x86_64.asm b/third_party/boringssl/win-x86_64/crypto/fipsmodule/aesni-x86_64.asm index bffc13ed9c1ccc..874596d7ea4633 100644 --- a/third_party/boringssl/win-x86_64/crypto/fipsmodule/aesni-x86_64.asm +++ b/third_party/boringssl/win-x86_64/crypto/fipsmodule/aesni-x86_64.asm @@ -904,226 +904,6 @@ $L$ecb_enc_ret: DB 0F3h,0C3h ;repret $L$SEH_end_aes_hw_ecb_encrypt: -global aes_hw_ccm64_encrypt_blocks - -ALIGN 16 -aes_hw_ccm64_encrypt_blocks: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_aes_hw_ccm64_encrypt_blocks: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - lea rsp,[((-88))+rsp] - movaps XMMWORD[rsp],xmm6 - movaps XMMWORD[16+rsp],xmm7 - movaps XMMWORD[32+rsp],xmm8 - movaps XMMWORD[48+rsp],xmm9 -$L$ccm64_enc_body: - mov eax,DWORD[240+rcx] - movdqu xmm6,XMMWORD[r8] - movdqa xmm9,XMMWORD[$L$increment64] - movdqa xmm7,XMMWORD[$L$bswap_mask] - - shl eax,4 - mov r10d,16 - lea r11,[rcx] - movdqu xmm3,XMMWORD[r9] - movdqa xmm2,xmm6 - lea rcx,[32+rax*1+rcx] -DB 102,15,56,0,247 - sub r10,rax - jmp NEAR $L$ccm64_enc_outer -ALIGN 16 -$L$ccm64_enc_outer: - movups xmm0,XMMWORD[r11] - mov rax,r10 - movups xmm8,XMMWORD[rdi] - - xorps xmm2,xmm0 - movups xmm1,XMMWORD[16+r11] - xorps xmm0,xmm8 - xorps xmm3,xmm0 - movups xmm0,XMMWORD[32+r11] - -$L$ccm64_enc2_loop: -DB 102,15,56,220,209 -DB 102,15,56,220,217 - movups xmm1,XMMWORD[rax*1+rcx] - add rax,32 -DB 102,15,56,220,208 -DB 102,15,56,220,216 - movups xmm0,XMMWORD[((-16))+rax*1+rcx] - jnz NEAR $L$ccm64_enc2_loop -DB 102,15,56,220,209 -DB 102,15,56,220,217 - paddq xmm6,xmm9 - dec rdx -DB 102,15,56,221,208 -DB 102,15,56,221,216 - - lea rdi,[16+rdi] - xorps xmm8,xmm2 - movdqa xmm2,xmm6 - movups XMMWORD[rsi],xmm8 -DB 102,15,56,0,215 - lea rsi,[16+rsi] - jnz NEAR $L$ccm64_enc_outer - - pxor xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - movups XMMWORD[r9],xmm3 - pxor xmm3,xmm3 - pxor xmm8,xmm8 - pxor xmm6,xmm6 - movaps xmm6,XMMWORD[rsp] - movaps XMMWORD[rsp],xmm0 - movaps xmm7,XMMWORD[16+rsp] - movaps XMMWORD[16+rsp],xmm0 - movaps xmm8,XMMWORD[32+rsp] - movaps XMMWORD[32+rsp],xmm0 - movaps xmm9,XMMWORD[48+rsp] - movaps XMMWORD[48+rsp],xmm0 - lea rsp,[88+rsp] -$L$ccm64_enc_ret: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h 
;repret -$L$SEH_end_aes_hw_ccm64_encrypt_blocks: -global aes_hw_ccm64_decrypt_blocks - -ALIGN 16 -aes_hw_ccm64_decrypt_blocks: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_aes_hw_ccm64_decrypt_blocks: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - lea rsp,[((-88))+rsp] - movaps XMMWORD[rsp],xmm6 - movaps XMMWORD[16+rsp],xmm7 - movaps XMMWORD[32+rsp],xmm8 - movaps XMMWORD[48+rsp],xmm9 -$L$ccm64_dec_body: - mov eax,DWORD[240+rcx] - movups xmm6,XMMWORD[r8] - movdqu xmm3,XMMWORD[r9] - movdqa xmm9,XMMWORD[$L$increment64] - movdqa xmm7,XMMWORD[$L$bswap_mask] - - movaps xmm2,xmm6 - mov r10d,eax - mov r11,rcx -DB 102,15,56,0,247 - movups xmm0,XMMWORD[rcx] - movups xmm1,XMMWORD[16+rcx] - lea rcx,[32+rcx] - xorps xmm2,xmm0 -$L$oop_enc1_5: -DB 102,15,56,220,209 - dec eax - movups xmm1,XMMWORD[rcx] - lea rcx,[16+rcx] - jnz NEAR $L$oop_enc1_5 -DB 102,15,56,221,209 - shl r10d,4 - mov eax,16 - movups xmm8,XMMWORD[rdi] - paddq xmm6,xmm9 - lea rdi,[16+rdi] - sub rax,r10 - lea rcx,[32+r10*1+r11] - mov r10,rax - jmp NEAR $L$ccm64_dec_outer -ALIGN 16 -$L$ccm64_dec_outer: - xorps xmm8,xmm2 - movdqa xmm2,xmm6 - movups XMMWORD[rsi],xmm8 - lea rsi,[16+rsi] -DB 102,15,56,0,215 - - sub rdx,1 - jz NEAR $L$ccm64_dec_break - - movups xmm0,XMMWORD[r11] - mov rax,r10 - movups xmm1,XMMWORD[16+r11] - xorps xmm8,xmm0 - xorps xmm2,xmm0 - xorps xmm3,xmm8 - movups xmm0,XMMWORD[32+r11] - jmp NEAR $L$ccm64_dec2_loop -ALIGN 16 -$L$ccm64_dec2_loop: -DB 102,15,56,220,209 -DB 102,15,56,220,217 - movups xmm1,XMMWORD[rax*1+rcx] - add rax,32 -DB 102,15,56,220,208 -DB 102,15,56,220,216 - movups xmm0,XMMWORD[((-16))+rax*1+rcx] - jnz NEAR $L$ccm64_dec2_loop - movups xmm8,XMMWORD[rdi] - paddq xmm6,xmm9 -DB 102,15,56,220,209 -DB 102,15,56,220,217 -DB 102,15,56,221,208 -DB 102,15,56,221,216 - lea rdi,[16+rdi] - jmp NEAR $L$ccm64_dec_outer - -ALIGN 16 -$L$ccm64_dec_break: - - mov eax,DWORD[240+r11] - movups xmm0,XMMWORD[r11] - movups xmm1,XMMWORD[16+r11] - xorps xmm8,xmm0 - lea r11,[32+r11] - xorps xmm3,xmm8 -$L$oop_enc1_6: -DB 102,15,56,220,217 - dec eax - movups xmm1,XMMWORD[r11] - lea r11,[16+r11] - jnz NEAR $L$oop_enc1_6 -DB 102,15,56,221,217 - pxor xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - movups XMMWORD[r9],xmm3 - pxor xmm3,xmm3 - pxor xmm8,xmm8 - pxor xmm6,xmm6 - movaps xmm6,XMMWORD[rsp] - movaps XMMWORD[rsp],xmm0 - movaps xmm7,XMMWORD[16+rsp] - movaps XMMWORD[16+rsp],xmm0 - movaps xmm8,XMMWORD[32+rsp] - movaps XMMWORD[32+rsp],xmm0 - movaps xmm9,XMMWORD[48+rsp] - movaps XMMWORD[48+rsp],xmm0 - lea rsp,[88+rsp] -$L$ccm64_dec_ret: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_aes_hw_ccm64_decrypt_blocks: global aes_hw_ctr32_encrypt_blocks ALIGN 16 @@ -1157,12 +937,12 @@ $L$SEH_begin_aes_hw_ctr32_encrypt_blocks: movups xmm1,XMMWORD[16+rcx] lea rcx,[32+rcx] xorps xmm2,xmm0 -$L$oop_enc1_7: +$L$oop_enc1_5: DB 102,15,56,220,209 dec edx movups xmm1,XMMWORD[rcx] lea rcx,[16+rcx] - jnz NEAR $L$oop_enc1_7 + jnz NEAR $L$oop_enc1_5 DB 102,15,56,221,209 pxor xmm0,xmm0 pxor xmm1,xmm1 @@ -1668,2041 +1448,80 @@ DB 102,15,56,221,233 movdqu XMMWORD[48+rsi],xmm5 jmp NEAR $L$ctr32_done -ALIGN 32 -$L$ctr32_loop3: -DB 102,15,56,220,209 - lea rcx,[16+rcx] - dec eax -DB 102,15,56,220,217 -DB 102,15,56,220,225 - movups xmm1,XMMWORD[rcx] - jnz NEAR $L$ctr32_loop3 -DB 102,15,56,221,209 -DB 102,15,56,221,217 -DB 102,15,56,221,225 - - movups xmm10,XMMWORD[rdi] - xorps xmm2,xmm10 - movups XMMWORD[rsi],xmm2 - 
cmp rdx,2 - jb NEAR $L$ctr32_done - - movups xmm11,XMMWORD[16+rdi] - xorps xmm3,xmm11 - movups XMMWORD[16+rsi],xmm3 - je NEAR $L$ctr32_done - - movups xmm12,XMMWORD[32+rdi] - xorps xmm4,xmm12 - movups XMMWORD[32+rsi],xmm4 - -$L$ctr32_done: - xorps xmm0,xmm0 - xor ebp,ebp - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - movaps xmm6,XMMWORD[((-168))+r11] - movaps XMMWORD[(-168)+r11],xmm0 - movaps xmm7,XMMWORD[((-152))+r11] - movaps XMMWORD[(-152)+r11],xmm0 - movaps xmm8,XMMWORD[((-136))+r11] - movaps XMMWORD[(-136)+r11],xmm0 - movaps xmm9,XMMWORD[((-120))+r11] - movaps XMMWORD[(-120)+r11],xmm0 - movaps xmm10,XMMWORD[((-104))+r11] - movaps XMMWORD[(-104)+r11],xmm0 - movaps xmm11,XMMWORD[((-88))+r11] - movaps XMMWORD[(-88)+r11],xmm0 - movaps xmm12,XMMWORD[((-72))+r11] - movaps XMMWORD[(-72)+r11],xmm0 - movaps xmm13,XMMWORD[((-56))+r11] - movaps XMMWORD[(-56)+r11],xmm0 - movaps xmm14,XMMWORD[((-40))+r11] - movaps XMMWORD[(-40)+r11],xmm0 - movaps xmm15,XMMWORD[((-24))+r11] - movaps XMMWORD[(-24)+r11],xmm0 - movaps XMMWORD[rsp],xmm0 - movaps XMMWORD[16+rsp],xmm0 - movaps XMMWORD[32+rsp],xmm0 - movaps XMMWORD[48+rsp],xmm0 - movaps XMMWORD[64+rsp],xmm0 - movaps XMMWORD[80+rsp],xmm0 - movaps XMMWORD[96+rsp],xmm0 - movaps XMMWORD[112+rsp],xmm0 - mov rbp,QWORD[((-8))+r11] - - lea rsp,[r11] - -$L$ctr32_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_aes_hw_ctr32_encrypt_blocks: -global aes_hw_xts_encrypt - -ALIGN 16 -aes_hw_xts_encrypt: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_aes_hw_xts_encrypt: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - - lea r11,[rsp] - - push rbp - - sub rsp,272 - and rsp,-16 - movaps XMMWORD[(-168)+r11],xmm6 - movaps XMMWORD[(-152)+r11],xmm7 - movaps XMMWORD[(-136)+r11],xmm8 - movaps XMMWORD[(-120)+r11],xmm9 - movaps XMMWORD[(-104)+r11],xmm10 - movaps XMMWORD[(-88)+r11],xmm11 - movaps XMMWORD[(-72)+r11],xmm12 - movaps XMMWORD[(-56)+r11],xmm13 - movaps XMMWORD[(-40)+r11],xmm14 - movaps XMMWORD[(-24)+r11],xmm15 -$L$xts_enc_body: - movups xmm2,XMMWORD[r9] - mov eax,DWORD[240+r8] - mov r10d,DWORD[240+rcx] - movups xmm0,XMMWORD[r8] - movups xmm1,XMMWORD[16+r8] - lea r8,[32+r8] - xorps xmm2,xmm0 -$L$oop_enc1_8: -DB 102,15,56,220,209 - dec eax - movups xmm1,XMMWORD[r8] - lea r8,[16+r8] - jnz NEAR $L$oop_enc1_8 -DB 102,15,56,221,209 - movups xmm0,XMMWORD[rcx] - mov rbp,rcx - mov eax,r10d - shl r10d,4 - mov r9,rdx - and rdx,-16 - - movups xmm1,XMMWORD[16+r10*1+rcx] - - movdqa xmm8,XMMWORD[$L$xts_magic] - movdqa xmm15,xmm2 - pshufd xmm9,xmm2,0x5f - pxor xmm1,xmm0 - movdqa xmm14,xmm9 - paddd xmm9,xmm9 - movdqa xmm10,xmm15 - psrad xmm14,31 - paddq xmm15,xmm15 - pand xmm14,xmm8 - pxor xmm10,xmm0 - pxor xmm15,xmm14 - movdqa xmm14,xmm9 - paddd xmm9,xmm9 - movdqa xmm11,xmm15 - psrad xmm14,31 - paddq xmm15,xmm15 - pand xmm14,xmm8 - pxor xmm11,xmm0 - pxor xmm15,xmm14 - movdqa xmm14,xmm9 - paddd xmm9,xmm9 - movdqa xmm12,xmm15 - psrad xmm14,31 - paddq xmm15,xmm15 - pand xmm14,xmm8 - pxor xmm12,xmm0 - pxor xmm15,xmm14 - movdqa xmm14,xmm9 - paddd xmm9,xmm9 - movdqa xmm13,xmm15 - psrad xmm14,31 - paddq xmm15,xmm15 - pand xmm14,xmm8 - pxor xmm13,xmm0 - pxor xmm15,xmm14 - movdqa xmm14,xmm15 - psrad xmm9,31 - paddq xmm15,xmm15 - pand xmm9,xmm8 - pxor xmm14,xmm0 - pxor xmm15,xmm9 - movaps XMMWORD[96+rsp],xmm1 - - sub rdx,16*6 - jc NEAR $L$xts_enc_short - - mov eax,16+96 - lea rcx,[32+r10*1+rbp] - 
sub rax,r10 - movups xmm1,XMMWORD[16+rbp] - mov r10,rax - lea r8,[$L$xts_magic] - jmp NEAR $L$xts_enc_grandloop - -ALIGN 32 -$L$xts_enc_grandloop: - movdqu xmm2,XMMWORD[rdi] - movdqa xmm8,xmm0 - movdqu xmm3,XMMWORD[16+rdi] - pxor xmm2,xmm10 - movdqu xmm4,XMMWORD[32+rdi] - pxor xmm3,xmm11 -DB 102,15,56,220,209 - movdqu xmm5,XMMWORD[48+rdi] - pxor xmm4,xmm12 -DB 102,15,56,220,217 - movdqu xmm6,XMMWORD[64+rdi] - pxor xmm5,xmm13 -DB 102,15,56,220,225 - movdqu xmm7,XMMWORD[80+rdi] - pxor xmm8,xmm15 - movdqa xmm9,XMMWORD[96+rsp] - pxor xmm6,xmm14 -DB 102,15,56,220,233 - movups xmm0,XMMWORD[32+rbp] - lea rdi,[96+rdi] - pxor xmm7,xmm8 - - pxor xmm10,xmm9 -DB 102,15,56,220,241 - pxor xmm11,xmm9 - movdqa XMMWORD[rsp],xmm10 -DB 102,15,56,220,249 - movups xmm1,XMMWORD[48+rbp] - pxor xmm12,xmm9 - -DB 102,15,56,220,208 - pxor xmm13,xmm9 - movdqa XMMWORD[16+rsp],xmm11 -DB 102,15,56,220,216 - pxor xmm14,xmm9 - movdqa XMMWORD[32+rsp],xmm12 -DB 102,15,56,220,224 -DB 102,15,56,220,232 - pxor xmm8,xmm9 - movdqa XMMWORD[64+rsp],xmm14 -DB 102,15,56,220,240 -DB 102,15,56,220,248 - movups xmm0,XMMWORD[64+rbp] - movdqa XMMWORD[80+rsp],xmm8 - pshufd xmm9,xmm15,0x5f - jmp NEAR $L$xts_enc_loop6 -ALIGN 32 -$L$xts_enc_loop6: -DB 102,15,56,220,209 -DB 102,15,56,220,217 -DB 102,15,56,220,225 -DB 102,15,56,220,233 -DB 102,15,56,220,241 -DB 102,15,56,220,249 - movups xmm1,XMMWORD[((-64))+rax*1+rcx] - add rax,32 - -DB 102,15,56,220,208 -DB 102,15,56,220,216 -DB 102,15,56,220,224 -DB 102,15,56,220,232 -DB 102,15,56,220,240 -DB 102,15,56,220,248 - movups xmm0,XMMWORD[((-80))+rax*1+rcx] - jnz NEAR $L$xts_enc_loop6 - - movdqa xmm8,XMMWORD[r8] - movdqa xmm14,xmm9 - paddd xmm9,xmm9 -DB 102,15,56,220,209 - paddq xmm15,xmm15 - psrad xmm14,31 -DB 102,15,56,220,217 - pand xmm14,xmm8 - movups xmm10,XMMWORD[rbp] -DB 102,15,56,220,225 -DB 102,15,56,220,233 -DB 102,15,56,220,241 - pxor xmm15,xmm14 - movaps xmm11,xmm10 -DB 102,15,56,220,249 - movups xmm1,XMMWORD[((-64))+rcx] - - movdqa xmm14,xmm9 -DB 102,15,56,220,208 - paddd xmm9,xmm9 - pxor xmm10,xmm15 -DB 102,15,56,220,216 - psrad xmm14,31 - paddq xmm15,xmm15 -DB 102,15,56,220,224 -DB 102,15,56,220,232 - pand xmm14,xmm8 - movaps xmm12,xmm11 -DB 102,15,56,220,240 - pxor xmm15,xmm14 - movdqa xmm14,xmm9 -DB 102,15,56,220,248 - movups xmm0,XMMWORD[((-48))+rcx] - - paddd xmm9,xmm9 -DB 102,15,56,220,209 - pxor xmm11,xmm15 - psrad xmm14,31 -DB 102,15,56,220,217 - paddq xmm15,xmm15 - pand xmm14,xmm8 -DB 102,15,56,220,225 -DB 102,15,56,220,233 - movdqa XMMWORD[48+rsp],xmm13 - pxor xmm15,xmm14 -DB 102,15,56,220,241 - movaps xmm13,xmm12 - movdqa xmm14,xmm9 -DB 102,15,56,220,249 - movups xmm1,XMMWORD[((-32))+rcx] - - paddd xmm9,xmm9 -DB 102,15,56,220,208 - pxor xmm12,xmm15 - psrad xmm14,31 -DB 102,15,56,220,216 - paddq xmm15,xmm15 - pand xmm14,xmm8 -DB 102,15,56,220,224 -DB 102,15,56,220,232 -DB 102,15,56,220,240 - pxor xmm15,xmm14 - movaps xmm14,xmm13 -DB 102,15,56,220,248 - - movdqa xmm0,xmm9 - paddd xmm9,xmm9 -DB 102,15,56,220,209 - pxor xmm13,xmm15 - psrad xmm0,31 -DB 102,15,56,220,217 - paddq xmm15,xmm15 - pand xmm0,xmm8 -DB 102,15,56,220,225 -DB 102,15,56,220,233 - pxor xmm15,xmm0 - movups xmm0,XMMWORD[rbp] -DB 102,15,56,220,241 -DB 102,15,56,220,249 - movups xmm1,XMMWORD[16+rbp] - - pxor xmm14,xmm15 -DB 102,15,56,221,84,36,0 - psrad xmm9,31 - paddq xmm15,xmm15 -DB 102,15,56,221,92,36,16 -DB 102,15,56,221,100,36,32 - pand xmm9,xmm8 - mov rax,r10 -DB 102,15,56,221,108,36,48 -DB 102,15,56,221,116,36,64 -DB 102,15,56,221,124,36,80 - pxor xmm15,xmm9 - - lea rsi,[96+rsi] - movups 
XMMWORD[(-96)+rsi],xmm2 - movups XMMWORD[(-80)+rsi],xmm3 - movups XMMWORD[(-64)+rsi],xmm4 - movups XMMWORD[(-48)+rsi],xmm5 - movups XMMWORD[(-32)+rsi],xmm6 - movups XMMWORD[(-16)+rsi],xmm7 - sub rdx,16*6 - jnc NEAR $L$xts_enc_grandloop - - mov eax,16+96 - sub eax,r10d - mov rcx,rbp - shr eax,4 - -$L$xts_enc_short: - - mov r10d,eax - pxor xmm10,xmm0 - add rdx,16*6 - jz NEAR $L$xts_enc_done - - pxor xmm11,xmm0 - cmp rdx,0x20 - jb NEAR $L$xts_enc_one - pxor xmm12,xmm0 - je NEAR $L$xts_enc_two - - pxor xmm13,xmm0 - cmp rdx,0x40 - jb NEAR $L$xts_enc_three - pxor xmm14,xmm0 - je NEAR $L$xts_enc_four - - movdqu xmm2,XMMWORD[rdi] - movdqu xmm3,XMMWORD[16+rdi] - movdqu xmm4,XMMWORD[32+rdi] - pxor xmm2,xmm10 - movdqu xmm5,XMMWORD[48+rdi] - pxor xmm3,xmm11 - movdqu xmm6,XMMWORD[64+rdi] - lea rdi,[80+rdi] - pxor xmm4,xmm12 - pxor xmm5,xmm13 - pxor xmm6,xmm14 - pxor xmm7,xmm7 - - call _aesni_encrypt6 - - xorps xmm2,xmm10 - movdqa xmm10,xmm15 - xorps xmm3,xmm11 - xorps xmm4,xmm12 - movdqu XMMWORD[rsi],xmm2 - xorps xmm5,xmm13 - movdqu XMMWORD[16+rsi],xmm3 - xorps xmm6,xmm14 - movdqu XMMWORD[32+rsi],xmm4 - movdqu XMMWORD[48+rsi],xmm5 - movdqu XMMWORD[64+rsi],xmm6 - lea rsi,[80+rsi] - jmp NEAR $L$xts_enc_done - -ALIGN 16 -$L$xts_enc_one: - movups xmm2,XMMWORD[rdi] - lea rdi,[16+rdi] - xorps xmm2,xmm10 - movups xmm0,XMMWORD[rcx] - movups xmm1,XMMWORD[16+rcx] - lea rcx,[32+rcx] - xorps xmm2,xmm0 -$L$oop_enc1_9: -DB 102,15,56,220,209 - dec eax - movups xmm1,XMMWORD[rcx] - lea rcx,[16+rcx] - jnz NEAR $L$oop_enc1_9 -DB 102,15,56,221,209 - xorps xmm2,xmm10 - movdqa xmm10,xmm11 - movups XMMWORD[rsi],xmm2 - lea rsi,[16+rsi] - jmp NEAR $L$xts_enc_done - -ALIGN 16 -$L$xts_enc_two: - movups xmm2,XMMWORD[rdi] - movups xmm3,XMMWORD[16+rdi] - lea rdi,[32+rdi] - xorps xmm2,xmm10 - xorps xmm3,xmm11 - - call _aesni_encrypt2 - - xorps xmm2,xmm10 - movdqa xmm10,xmm12 - xorps xmm3,xmm11 - movups XMMWORD[rsi],xmm2 - movups XMMWORD[16+rsi],xmm3 - lea rsi,[32+rsi] - jmp NEAR $L$xts_enc_done - -ALIGN 16 -$L$xts_enc_three: - movups xmm2,XMMWORD[rdi] - movups xmm3,XMMWORD[16+rdi] - movups xmm4,XMMWORD[32+rdi] - lea rdi,[48+rdi] - xorps xmm2,xmm10 - xorps xmm3,xmm11 - xorps xmm4,xmm12 - - call _aesni_encrypt3 - - xorps xmm2,xmm10 - movdqa xmm10,xmm13 - xorps xmm3,xmm11 - xorps xmm4,xmm12 - movups XMMWORD[rsi],xmm2 - movups XMMWORD[16+rsi],xmm3 - movups XMMWORD[32+rsi],xmm4 - lea rsi,[48+rsi] - jmp NEAR $L$xts_enc_done - -ALIGN 16 -$L$xts_enc_four: - movups xmm2,XMMWORD[rdi] - movups xmm3,XMMWORD[16+rdi] - movups xmm4,XMMWORD[32+rdi] - xorps xmm2,xmm10 - movups xmm5,XMMWORD[48+rdi] - lea rdi,[64+rdi] - xorps xmm3,xmm11 - xorps xmm4,xmm12 - xorps xmm5,xmm13 - - call _aesni_encrypt4 - - pxor xmm2,xmm10 - movdqa xmm10,xmm14 - pxor xmm3,xmm11 - pxor xmm4,xmm12 - movdqu XMMWORD[rsi],xmm2 - pxor xmm5,xmm13 - movdqu XMMWORD[16+rsi],xmm3 - movdqu XMMWORD[32+rsi],xmm4 - movdqu XMMWORD[48+rsi],xmm5 - lea rsi,[64+rsi] - jmp NEAR $L$xts_enc_done - -ALIGN 16 -$L$xts_enc_done: - and r9,15 - jz NEAR $L$xts_enc_ret - mov rdx,r9 - -$L$xts_enc_steal: - movzx eax,BYTE[rdi] - movzx ecx,BYTE[((-16))+rsi] - lea rdi,[1+rdi] - mov BYTE[((-16))+rsi],al - mov BYTE[rsi],cl - lea rsi,[1+rsi] - sub rdx,1 - jnz NEAR $L$xts_enc_steal - - sub rsi,r9 - mov rcx,rbp - mov eax,r10d - - movups xmm2,XMMWORD[((-16))+rsi] - xorps xmm2,xmm10 - movups xmm0,XMMWORD[rcx] - movups xmm1,XMMWORD[16+rcx] - lea rcx,[32+rcx] - xorps xmm2,xmm0 -$L$oop_enc1_10: -DB 102,15,56,220,209 - dec eax - movups xmm1,XMMWORD[rcx] - lea rcx,[16+rcx] - jnz NEAR $L$oop_enc1_10 -DB 102,15,56,221,209 
- xorps xmm2,xmm10 - movups XMMWORD[(-16)+rsi],xmm2 - -$L$xts_enc_ret: - xorps xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - movaps xmm6,XMMWORD[((-168))+r11] - movaps XMMWORD[(-168)+r11],xmm0 - movaps xmm7,XMMWORD[((-152))+r11] - movaps XMMWORD[(-152)+r11],xmm0 - movaps xmm8,XMMWORD[((-136))+r11] - movaps XMMWORD[(-136)+r11],xmm0 - movaps xmm9,XMMWORD[((-120))+r11] - movaps XMMWORD[(-120)+r11],xmm0 - movaps xmm10,XMMWORD[((-104))+r11] - movaps XMMWORD[(-104)+r11],xmm0 - movaps xmm11,XMMWORD[((-88))+r11] - movaps XMMWORD[(-88)+r11],xmm0 - movaps xmm12,XMMWORD[((-72))+r11] - movaps XMMWORD[(-72)+r11],xmm0 - movaps xmm13,XMMWORD[((-56))+r11] - movaps XMMWORD[(-56)+r11],xmm0 - movaps xmm14,XMMWORD[((-40))+r11] - movaps XMMWORD[(-40)+r11],xmm0 - movaps xmm15,XMMWORD[((-24))+r11] - movaps XMMWORD[(-24)+r11],xmm0 - movaps XMMWORD[rsp],xmm0 - movaps XMMWORD[16+rsp],xmm0 - movaps XMMWORD[32+rsp],xmm0 - movaps XMMWORD[48+rsp],xmm0 - movaps XMMWORD[64+rsp],xmm0 - movaps XMMWORD[80+rsp],xmm0 - movaps XMMWORD[96+rsp],xmm0 - mov rbp,QWORD[((-8))+r11] - - lea rsp,[r11] - -$L$xts_enc_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_aes_hw_xts_encrypt: -global aes_hw_xts_decrypt - -ALIGN 16 -aes_hw_xts_decrypt: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_aes_hw_xts_decrypt: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - - lea r11,[rsp] - - push rbp - - sub rsp,272 - and rsp,-16 - movaps XMMWORD[(-168)+r11],xmm6 - movaps XMMWORD[(-152)+r11],xmm7 - movaps XMMWORD[(-136)+r11],xmm8 - movaps XMMWORD[(-120)+r11],xmm9 - movaps XMMWORD[(-104)+r11],xmm10 - movaps XMMWORD[(-88)+r11],xmm11 - movaps XMMWORD[(-72)+r11],xmm12 - movaps XMMWORD[(-56)+r11],xmm13 - movaps XMMWORD[(-40)+r11],xmm14 - movaps XMMWORD[(-24)+r11],xmm15 -$L$xts_dec_body: - movups xmm2,XMMWORD[r9] - mov eax,DWORD[240+r8] - mov r10d,DWORD[240+rcx] - movups xmm0,XMMWORD[r8] - movups xmm1,XMMWORD[16+r8] - lea r8,[32+r8] - xorps xmm2,xmm0 -$L$oop_enc1_11: -DB 102,15,56,220,209 - dec eax - movups xmm1,XMMWORD[r8] - lea r8,[16+r8] - jnz NEAR $L$oop_enc1_11 -DB 102,15,56,221,209 - xor eax,eax - test rdx,15 - setnz al - shl rax,4 - sub rdx,rax - - movups xmm0,XMMWORD[rcx] - mov rbp,rcx - mov eax,r10d - shl r10d,4 - mov r9,rdx - and rdx,-16 - - movups xmm1,XMMWORD[16+r10*1+rcx] - - movdqa xmm8,XMMWORD[$L$xts_magic] - movdqa xmm15,xmm2 - pshufd xmm9,xmm2,0x5f - pxor xmm1,xmm0 - movdqa xmm14,xmm9 - paddd xmm9,xmm9 - movdqa xmm10,xmm15 - psrad xmm14,31 - paddq xmm15,xmm15 - pand xmm14,xmm8 - pxor xmm10,xmm0 - pxor xmm15,xmm14 - movdqa xmm14,xmm9 - paddd xmm9,xmm9 - movdqa xmm11,xmm15 - psrad xmm14,31 - paddq xmm15,xmm15 - pand xmm14,xmm8 - pxor xmm11,xmm0 - pxor xmm15,xmm14 - movdqa xmm14,xmm9 - paddd xmm9,xmm9 - movdqa xmm12,xmm15 - psrad xmm14,31 - paddq xmm15,xmm15 - pand xmm14,xmm8 - pxor xmm12,xmm0 - pxor xmm15,xmm14 - movdqa xmm14,xmm9 - paddd xmm9,xmm9 - movdqa xmm13,xmm15 - psrad xmm14,31 - paddq xmm15,xmm15 - pand xmm14,xmm8 - pxor xmm13,xmm0 - pxor xmm15,xmm14 - movdqa xmm14,xmm15 - psrad xmm9,31 - paddq xmm15,xmm15 - pand xmm9,xmm8 - pxor xmm14,xmm0 - pxor xmm15,xmm9 - movaps XMMWORD[96+rsp],xmm1 - - sub rdx,16*6 - jc NEAR $L$xts_dec_short - - mov eax,16+96 - lea rcx,[32+r10*1+rbp] - sub rax,r10 - movups xmm1,XMMWORD[16+rbp] - mov r10,rax - lea r8,[$L$xts_magic] - jmp NEAR $L$xts_dec_grandloop - -ALIGN 32 -$L$xts_dec_grandloop: - 
movdqu xmm2,XMMWORD[rdi] - movdqa xmm8,xmm0 - movdqu xmm3,XMMWORD[16+rdi] - pxor xmm2,xmm10 - movdqu xmm4,XMMWORD[32+rdi] - pxor xmm3,xmm11 -DB 102,15,56,222,209 - movdqu xmm5,XMMWORD[48+rdi] - pxor xmm4,xmm12 -DB 102,15,56,222,217 - movdqu xmm6,XMMWORD[64+rdi] - pxor xmm5,xmm13 -DB 102,15,56,222,225 - movdqu xmm7,XMMWORD[80+rdi] - pxor xmm8,xmm15 - movdqa xmm9,XMMWORD[96+rsp] - pxor xmm6,xmm14 -DB 102,15,56,222,233 - movups xmm0,XMMWORD[32+rbp] - lea rdi,[96+rdi] - pxor xmm7,xmm8 - - pxor xmm10,xmm9 -DB 102,15,56,222,241 - pxor xmm11,xmm9 - movdqa XMMWORD[rsp],xmm10 -DB 102,15,56,222,249 - movups xmm1,XMMWORD[48+rbp] - pxor xmm12,xmm9 - -DB 102,15,56,222,208 - pxor xmm13,xmm9 - movdqa XMMWORD[16+rsp],xmm11 -DB 102,15,56,222,216 - pxor xmm14,xmm9 - movdqa XMMWORD[32+rsp],xmm12 -DB 102,15,56,222,224 -DB 102,15,56,222,232 - pxor xmm8,xmm9 - movdqa XMMWORD[64+rsp],xmm14 -DB 102,15,56,222,240 -DB 102,15,56,222,248 - movups xmm0,XMMWORD[64+rbp] - movdqa XMMWORD[80+rsp],xmm8 - pshufd xmm9,xmm15,0x5f - jmp NEAR $L$xts_dec_loop6 -ALIGN 32 -$L$xts_dec_loop6: -DB 102,15,56,222,209 -DB 102,15,56,222,217 -DB 102,15,56,222,225 -DB 102,15,56,222,233 -DB 102,15,56,222,241 -DB 102,15,56,222,249 - movups xmm1,XMMWORD[((-64))+rax*1+rcx] - add rax,32 - -DB 102,15,56,222,208 -DB 102,15,56,222,216 -DB 102,15,56,222,224 -DB 102,15,56,222,232 -DB 102,15,56,222,240 -DB 102,15,56,222,248 - movups xmm0,XMMWORD[((-80))+rax*1+rcx] - jnz NEAR $L$xts_dec_loop6 - - movdqa xmm8,XMMWORD[r8] - movdqa xmm14,xmm9 - paddd xmm9,xmm9 -DB 102,15,56,222,209 - paddq xmm15,xmm15 - psrad xmm14,31 -DB 102,15,56,222,217 - pand xmm14,xmm8 - movups xmm10,XMMWORD[rbp] -DB 102,15,56,222,225 -DB 102,15,56,222,233 -DB 102,15,56,222,241 - pxor xmm15,xmm14 - movaps xmm11,xmm10 -DB 102,15,56,222,249 - movups xmm1,XMMWORD[((-64))+rcx] - - movdqa xmm14,xmm9 -DB 102,15,56,222,208 - paddd xmm9,xmm9 - pxor xmm10,xmm15 -DB 102,15,56,222,216 - psrad xmm14,31 - paddq xmm15,xmm15 -DB 102,15,56,222,224 -DB 102,15,56,222,232 - pand xmm14,xmm8 - movaps xmm12,xmm11 -DB 102,15,56,222,240 - pxor xmm15,xmm14 - movdqa xmm14,xmm9 -DB 102,15,56,222,248 - movups xmm0,XMMWORD[((-48))+rcx] - - paddd xmm9,xmm9 -DB 102,15,56,222,209 - pxor xmm11,xmm15 - psrad xmm14,31 -DB 102,15,56,222,217 - paddq xmm15,xmm15 - pand xmm14,xmm8 -DB 102,15,56,222,225 -DB 102,15,56,222,233 - movdqa XMMWORD[48+rsp],xmm13 - pxor xmm15,xmm14 -DB 102,15,56,222,241 - movaps xmm13,xmm12 - movdqa xmm14,xmm9 -DB 102,15,56,222,249 - movups xmm1,XMMWORD[((-32))+rcx] - - paddd xmm9,xmm9 -DB 102,15,56,222,208 - pxor xmm12,xmm15 - psrad xmm14,31 -DB 102,15,56,222,216 - paddq xmm15,xmm15 - pand xmm14,xmm8 -DB 102,15,56,222,224 -DB 102,15,56,222,232 -DB 102,15,56,222,240 - pxor xmm15,xmm14 - movaps xmm14,xmm13 -DB 102,15,56,222,248 - - movdqa xmm0,xmm9 - paddd xmm9,xmm9 -DB 102,15,56,222,209 - pxor xmm13,xmm15 - psrad xmm0,31 -DB 102,15,56,222,217 - paddq xmm15,xmm15 - pand xmm0,xmm8 -DB 102,15,56,222,225 -DB 102,15,56,222,233 - pxor xmm15,xmm0 - movups xmm0,XMMWORD[rbp] -DB 102,15,56,222,241 -DB 102,15,56,222,249 - movups xmm1,XMMWORD[16+rbp] - - pxor xmm14,xmm15 -DB 102,15,56,223,84,36,0 - psrad xmm9,31 - paddq xmm15,xmm15 -DB 102,15,56,223,92,36,16 -DB 102,15,56,223,100,36,32 - pand xmm9,xmm8 - mov rax,r10 -DB 102,15,56,223,108,36,48 -DB 102,15,56,223,116,36,64 -DB 102,15,56,223,124,36,80 - pxor xmm15,xmm9 - - lea rsi,[96+rsi] - movups XMMWORD[(-96)+rsi],xmm2 - movups XMMWORD[(-80)+rsi],xmm3 - movups XMMWORD[(-64)+rsi],xmm4 - movups XMMWORD[(-48)+rsi],xmm5 - movups XMMWORD[(-32)+rsi],xmm6 - movups 
XMMWORD[(-16)+rsi],xmm7 - sub rdx,16*6 - jnc NEAR $L$xts_dec_grandloop - - mov eax,16+96 - sub eax,r10d - mov rcx,rbp - shr eax,4 - -$L$xts_dec_short: - - mov r10d,eax - pxor xmm10,xmm0 - pxor xmm11,xmm0 - add rdx,16*6 - jz NEAR $L$xts_dec_done - - pxor xmm12,xmm0 - cmp rdx,0x20 - jb NEAR $L$xts_dec_one - pxor xmm13,xmm0 - je NEAR $L$xts_dec_two - - pxor xmm14,xmm0 - cmp rdx,0x40 - jb NEAR $L$xts_dec_three - je NEAR $L$xts_dec_four - - movdqu xmm2,XMMWORD[rdi] - movdqu xmm3,XMMWORD[16+rdi] - movdqu xmm4,XMMWORD[32+rdi] - pxor xmm2,xmm10 - movdqu xmm5,XMMWORD[48+rdi] - pxor xmm3,xmm11 - movdqu xmm6,XMMWORD[64+rdi] - lea rdi,[80+rdi] - pxor xmm4,xmm12 - pxor xmm5,xmm13 - pxor xmm6,xmm14 - - call _aesni_decrypt6 - - xorps xmm2,xmm10 - xorps xmm3,xmm11 - xorps xmm4,xmm12 - movdqu XMMWORD[rsi],xmm2 - xorps xmm5,xmm13 - movdqu XMMWORD[16+rsi],xmm3 - xorps xmm6,xmm14 - movdqu XMMWORD[32+rsi],xmm4 - pxor xmm14,xmm14 - movdqu XMMWORD[48+rsi],xmm5 - pcmpgtd xmm14,xmm15 - movdqu XMMWORD[64+rsi],xmm6 - lea rsi,[80+rsi] - pshufd xmm11,xmm14,0x13 - and r9,15 - jz NEAR $L$xts_dec_ret - - movdqa xmm10,xmm15 - paddq xmm15,xmm15 - pand xmm11,xmm8 - pxor xmm11,xmm15 - jmp NEAR $L$xts_dec_done2 - -ALIGN 16 -$L$xts_dec_one: - movups xmm2,XMMWORD[rdi] - lea rdi,[16+rdi] - xorps xmm2,xmm10 - movups xmm0,XMMWORD[rcx] - movups xmm1,XMMWORD[16+rcx] - lea rcx,[32+rcx] - xorps xmm2,xmm0 -$L$oop_dec1_12: -DB 102,15,56,222,209 - dec eax - movups xmm1,XMMWORD[rcx] - lea rcx,[16+rcx] - jnz NEAR $L$oop_dec1_12 -DB 102,15,56,223,209 - xorps xmm2,xmm10 - movdqa xmm10,xmm11 - movups XMMWORD[rsi],xmm2 - movdqa xmm11,xmm12 - lea rsi,[16+rsi] - jmp NEAR $L$xts_dec_done - -ALIGN 16 -$L$xts_dec_two: - movups xmm2,XMMWORD[rdi] - movups xmm3,XMMWORD[16+rdi] - lea rdi,[32+rdi] - xorps xmm2,xmm10 - xorps xmm3,xmm11 - - call _aesni_decrypt2 - - xorps xmm2,xmm10 - movdqa xmm10,xmm12 - xorps xmm3,xmm11 - movdqa xmm11,xmm13 - movups XMMWORD[rsi],xmm2 - movups XMMWORD[16+rsi],xmm3 - lea rsi,[32+rsi] - jmp NEAR $L$xts_dec_done - -ALIGN 16 -$L$xts_dec_three: - movups xmm2,XMMWORD[rdi] - movups xmm3,XMMWORD[16+rdi] - movups xmm4,XMMWORD[32+rdi] - lea rdi,[48+rdi] - xorps xmm2,xmm10 - xorps xmm3,xmm11 - xorps xmm4,xmm12 - - call _aesni_decrypt3 - - xorps xmm2,xmm10 - movdqa xmm10,xmm13 - xorps xmm3,xmm11 - movdqa xmm11,xmm14 - xorps xmm4,xmm12 - movups XMMWORD[rsi],xmm2 - movups XMMWORD[16+rsi],xmm3 - movups XMMWORD[32+rsi],xmm4 - lea rsi,[48+rsi] - jmp NEAR $L$xts_dec_done - -ALIGN 16 -$L$xts_dec_four: - movups xmm2,XMMWORD[rdi] - movups xmm3,XMMWORD[16+rdi] - movups xmm4,XMMWORD[32+rdi] - xorps xmm2,xmm10 - movups xmm5,XMMWORD[48+rdi] - lea rdi,[64+rdi] - xorps xmm3,xmm11 - xorps xmm4,xmm12 - xorps xmm5,xmm13 - - call _aesni_decrypt4 - - pxor xmm2,xmm10 - movdqa xmm10,xmm14 - pxor xmm3,xmm11 - movdqa xmm11,xmm15 - pxor xmm4,xmm12 - movdqu XMMWORD[rsi],xmm2 - pxor xmm5,xmm13 - movdqu XMMWORD[16+rsi],xmm3 - movdqu XMMWORD[32+rsi],xmm4 - movdqu XMMWORD[48+rsi],xmm5 - lea rsi,[64+rsi] - jmp NEAR $L$xts_dec_done - -ALIGN 16 -$L$xts_dec_done: - and r9,15 - jz NEAR $L$xts_dec_ret -$L$xts_dec_done2: - mov rdx,r9 - mov rcx,rbp - mov eax,r10d - - movups xmm2,XMMWORD[rdi] - xorps xmm2,xmm11 - movups xmm0,XMMWORD[rcx] - movups xmm1,XMMWORD[16+rcx] - lea rcx,[32+rcx] - xorps xmm2,xmm0 -$L$oop_dec1_13: -DB 102,15,56,222,209 - dec eax - movups xmm1,XMMWORD[rcx] - lea rcx,[16+rcx] - jnz NEAR $L$oop_dec1_13 -DB 102,15,56,223,209 - xorps xmm2,xmm11 - movups XMMWORD[rsi],xmm2 - -$L$xts_dec_steal: - movzx eax,BYTE[16+rdi] - movzx ecx,BYTE[rsi] - lea 
rdi,[1+rdi] - mov BYTE[rsi],al - mov BYTE[16+rsi],cl - lea rsi,[1+rsi] - sub rdx,1 - jnz NEAR $L$xts_dec_steal - - sub rsi,r9 - mov rcx,rbp - mov eax,r10d - - movups xmm2,XMMWORD[rsi] - xorps xmm2,xmm10 - movups xmm0,XMMWORD[rcx] - movups xmm1,XMMWORD[16+rcx] - lea rcx,[32+rcx] - xorps xmm2,xmm0 -$L$oop_dec1_14: -DB 102,15,56,222,209 - dec eax - movups xmm1,XMMWORD[rcx] - lea rcx,[16+rcx] - jnz NEAR $L$oop_dec1_14 -DB 102,15,56,223,209 - xorps xmm2,xmm10 - movups XMMWORD[rsi],xmm2 - -$L$xts_dec_ret: - xorps xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - movaps xmm6,XMMWORD[((-168))+r11] - movaps XMMWORD[(-168)+r11],xmm0 - movaps xmm7,XMMWORD[((-152))+r11] - movaps XMMWORD[(-152)+r11],xmm0 - movaps xmm8,XMMWORD[((-136))+r11] - movaps XMMWORD[(-136)+r11],xmm0 - movaps xmm9,XMMWORD[((-120))+r11] - movaps XMMWORD[(-120)+r11],xmm0 - movaps xmm10,XMMWORD[((-104))+r11] - movaps XMMWORD[(-104)+r11],xmm0 - movaps xmm11,XMMWORD[((-88))+r11] - movaps XMMWORD[(-88)+r11],xmm0 - movaps xmm12,XMMWORD[((-72))+r11] - movaps XMMWORD[(-72)+r11],xmm0 - movaps xmm13,XMMWORD[((-56))+r11] - movaps XMMWORD[(-56)+r11],xmm0 - movaps xmm14,XMMWORD[((-40))+r11] - movaps XMMWORD[(-40)+r11],xmm0 - movaps xmm15,XMMWORD[((-24))+r11] - movaps XMMWORD[(-24)+r11],xmm0 - movaps XMMWORD[rsp],xmm0 - movaps XMMWORD[16+rsp],xmm0 - movaps XMMWORD[32+rsp],xmm0 - movaps XMMWORD[48+rsp],xmm0 - movaps XMMWORD[64+rsp],xmm0 - movaps XMMWORD[80+rsp],xmm0 - movaps XMMWORD[96+rsp],xmm0 - mov rbp,QWORD[((-8))+r11] - - lea rsp,[r11] - -$L$xts_dec_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_aes_hw_xts_decrypt: -global aes_hw_ocb_encrypt - -ALIGN 32 -aes_hw_ocb_encrypt: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_aes_hw_ocb_encrypt: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - - lea rax,[rsp] - push rbx - - push rbp - - push r12 - - push r13 - - push r14 - - lea rsp,[((-160))+rsp] - movaps XMMWORD[rsp],xmm6 - movaps XMMWORD[16+rsp],xmm7 - movaps XMMWORD[32+rsp],xmm8 - movaps XMMWORD[48+rsp],xmm9 - movaps XMMWORD[64+rsp],xmm10 - movaps XMMWORD[80+rsp],xmm11 - movaps XMMWORD[96+rsp],xmm12 - movaps XMMWORD[112+rsp],xmm13 - movaps XMMWORD[128+rsp],xmm14 - movaps XMMWORD[144+rsp],xmm15 -$L$ocb_enc_body: - mov rbx,QWORD[56+rax] - mov rbp,QWORD[((56+8))+rax] - - mov r10d,DWORD[240+rcx] - mov r11,rcx - shl r10d,4 - movups xmm9,XMMWORD[rcx] - movups xmm1,XMMWORD[16+r10*1+rcx] - - movdqu xmm15,XMMWORD[r9] - pxor xmm9,xmm1 - pxor xmm15,xmm1 - - mov eax,16+32 - lea rcx,[32+r10*1+r11] - movups xmm1,XMMWORD[16+r11] - sub rax,r10 - mov r10,rax - - movdqu xmm10,XMMWORD[rbx] - movdqu xmm8,XMMWORD[rbp] - - test r8,1 - jnz NEAR $L$ocb_enc_odd - - bsf r12,r8 - add r8,1 - shl r12,4 - movdqu xmm7,XMMWORD[r12*1+rbx] - movdqu xmm2,XMMWORD[rdi] - lea rdi,[16+rdi] - - call __ocb_encrypt1 - - movdqa xmm15,xmm7 - movups XMMWORD[rsi],xmm2 - lea rsi,[16+rsi] - sub rdx,1 - jz NEAR $L$ocb_enc_done - -$L$ocb_enc_odd: - lea r12,[1+r8] - lea r13,[3+r8] - lea r14,[5+r8] - lea r8,[6+r8] - bsf r12,r12 - bsf r13,r13 - bsf r14,r14 - shl r12,4 - shl r13,4 - shl r14,4 - - sub rdx,6 - jc NEAR $L$ocb_enc_short - jmp NEAR $L$ocb_enc_grandloop - -ALIGN 32 -$L$ocb_enc_grandloop: - movdqu xmm2,XMMWORD[rdi] - movdqu xmm3,XMMWORD[16+rdi] - movdqu xmm4,XMMWORD[32+rdi] - movdqu xmm5,XMMWORD[48+rdi] - movdqu xmm6,XMMWORD[64+rdi] - movdqu xmm7,XMMWORD[80+rdi] - lea 
rdi,[96+rdi] - - call __ocb_encrypt6 - - movups XMMWORD[rsi],xmm2 - movups XMMWORD[16+rsi],xmm3 - movups XMMWORD[32+rsi],xmm4 - movups XMMWORD[48+rsi],xmm5 - movups XMMWORD[64+rsi],xmm6 - movups XMMWORD[80+rsi],xmm7 - lea rsi,[96+rsi] - sub rdx,6 - jnc NEAR $L$ocb_enc_grandloop - -$L$ocb_enc_short: - add rdx,6 - jz NEAR $L$ocb_enc_done - - movdqu xmm2,XMMWORD[rdi] - cmp rdx,2 - jb NEAR $L$ocb_enc_one - movdqu xmm3,XMMWORD[16+rdi] - je NEAR $L$ocb_enc_two - - movdqu xmm4,XMMWORD[32+rdi] - cmp rdx,4 - jb NEAR $L$ocb_enc_three - movdqu xmm5,XMMWORD[48+rdi] - je NEAR $L$ocb_enc_four - - movdqu xmm6,XMMWORD[64+rdi] - pxor xmm7,xmm7 - - call __ocb_encrypt6 - - movdqa xmm15,xmm14 - movups XMMWORD[rsi],xmm2 - movups XMMWORD[16+rsi],xmm3 - movups XMMWORD[32+rsi],xmm4 - movups XMMWORD[48+rsi],xmm5 - movups XMMWORD[64+rsi],xmm6 - - jmp NEAR $L$ocb_enc_done - -ALIGN 16 -$L$ocb_enc_one: - movdqa xmm7,xmm10 - - call __ocb_encrypt1 - - movdqa xmm15,xmm7 - movups XMMWORD[rsi],xmm2 - jmp NEAR $L$ocb_enc_done - -ALIGN 16 -$L$ocb_enc_two: - pxor xmm4,xmm4 - pxor xmm5,xmm5 - - call __ocb_encrypt4 - - movdqa xmm15,xmm11 - movups XMMWORD[rsi],xmm2 - movups XMMWORD[16+rsi],xmm3 - - jmp NEAR $L$ocb_enc_done - -ALIGN 16 -$L$ocb_enc_three: - pxor xmm5,xmm5 - - call __ocb_encrypt4 - - movdqa xmm15,xmm12 - movups XMMWORD[rsi],xmm2 - movups XMMWORD[16+rsi],xmm3 - movups XMMWORD[32+rsi],xmm4 - - jmp NEAR $L$ocb_enc_done - -ALIGN 16 -$L$ocb_enc_four: - call __ocb_encrypt4 - - movdqa xmm15,xmm13 - movups XMMWORD[rsi],xmm2 - movups XMMWORD[16+rsi],xmm3 - movups XMMWORD[32+rsi],xmm4 - movups XMMWORD[48+rsi],xmm5 - -$L$ocb_enc_done: - pxor xmm15,xmm0 - movdqu XMMWORD[rbp],xmm8 - movdqu XMMWORD[r9],xmm15 - - xorps xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - movaps xmm6,XMMWORD[rsp] - movaps XMMWORD[rsp],xmm0 - movaps xmm7,XMMWORD[16+rsp] - movaps XMMWORD[16+rsp],xmm0 - movaps xmm8,XMMWORD[32+rsp] - movaps XMMWORD[32+rsp],xmm0 - movaps xmm9,XMMWORD[48+rsp] - movaps XMMWORD[48+rsp],xmm0 - movaps xmm10,XMMWORD[64+rsp] - movaps XMMWORD[64+rsp],xmm0 - movaps xmm11,XMMWORD[80+rsp] - movaps XMMWORD[80+rsp],xmm0 - movaps xmm12,XMMWORD[96+rsp] - movaps XMMWORD[96+rsp],xmm0 - movaps xmm13,XMMWORD[112+rsp] - movaps XMMWORD[112+rsp],xmm0 - movaps xmm14,XMMWORD[128+rsp] - movaps XMMWORD[128+rsp],xmm0 - movaps xmm15,XMMWORD[144+rsp] - movaps XMMWORD[144+rsp],xmm0 - lea rax,[((160+40))+rsp] -$L$ocb_enc_pop: - mov r14,QWORD[((-40))+rax] - - mov r13,QWORD[((-32))+rax] - - mov r12,QWORD[((-24))+rax] - - mov rbp,QWORD[((-16))+rax] - - mov rbx,QWORD[((-8))+rax] - - lea rsp,[rax] - -$L$ocb_enc_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_aes_hw_ocb_encrypt: - - -ALIGN 32 -__ocb_encrypt6: - pxor xmm15,xmm9 - movdqu xmm11,XMMWORD[r12*1+rbx] - movdqa xmm12,xmm10 - movdqu xmm13,XMMWORD[r13*1+rbx] - movdqa xmm14,xmm10 - pxor xmm10,xmm15 - movdqu xmm15,XMMWORD[r14*1+rbx] - pxor xmm11,xmm10 - pxor xmm8,xmm2 - pxor xmm2,xmm10 - pxor xmm12,xmm11 - pxor xmm8,xmm3 - pxor xmm3,xmm11 - pxor xmm13,xmm12 - pxor xmm8,xmm4 - pxor xmm4,xmm12 - pxor xmm14,xmm13 - pxor xmm8,xmm5 - pxor xmm5,xmm13 - pxor xmm15,xmm14 - pxor xmm8,xmm6 - pxor xmm6,xmm14 - pxor xmm8,xmm7 - pxor xmm7,xmm15 - movups xmm0,XMMWORD[32+r11] - - lea r12,[1+r8] - lea r13,[3+r8] - lea r14,[5+r8] - add r8,6 - pxor xmm10,xmm9 - bsf r12,r12 - bsf r13,r13 - bsf r14,r14 - -DB 102,15,56,220,209 -DB 102,15,56,220,217 -DB 102,15,56,220,225 -DB 102,15,56,220,233 - pxor xmm11,xmm9 - pxor 
xmm12,xmm9 -DB 102,15,56,220,241 - pxor xmm13,xmm9 - pxor xmm14,xmm9 -DB 102,15,56,220,249 - movups xmm1,XMMWORD[48+r11] - pxor xmm15,xmm9 - -DB 102,15,56,220,208 -DB 102,15,56,220,216 -DB 102,15,56,220,224 -DB 102,15,56,220,232 -DB 102,15,56,220,240 -DB 102,15,56,220,248 - movups xmm0,XMMWORD[64+r11] - shl r12,4 - shl r13,4 - jmp NEAR $L$ocb_enc_loop6 - -ALIGN 32 -$L$ocb_enc_loop6: -DB 102,15,56,220,209 -DB 102,15,56,220,217 -DB 102,15,56,220,225 -DB 102,15,56,220,233 -DB 102,15,56,220,241 -DB 102,15,56,220,249 - movups xmm1,XMMWORD[rax*1+rcx] - add rax,32 - -DB 102,15,56,220,208 -DB 102,15,56,220,216 -DB 102,15,56,220,224 -DB 102,15,56,220,232 -DB 102,15,56,220,240 -DB 102,15,56,220,248 - movups xmm0,XMMWORD[((-16))+rax*1+rcx] - jnz NEAR $L$ocb_enc_loop6 - -DB 102,15,56,220,209 -DB 102,15,56,220,217 -DB 102,15,56,220,225 -DB 102,15,56,220,233 -DB 102,15,56,220,241 -DB 102,15,56,220,249 - movups xmm1,XMMWORD[16+r11] - shl r14,4 - -DB 102,65,15,56,221,210 - movdqu xmm10,XMMWORD[rbx] - mov rax,r10 -DB 102,65,15,56,221,219 -DB 102,65,15,56,221,228 -DB 102,65,15,56,221,237 -DB 102,65,15,56,221,246 -DB 102,65,15,56,221,255 - DB 0F3h,0C3h ;repret - - - -ALIGN 32 -__ocb_encrypt4: - pxor xmm15,xmm9 - movdqu xmm11,XMMWORD[r12*1+rbx] - movdqa xmm12,xmm10 - movdqu xmm13,XMMWORD[r13*1+rbx] - pxor xmm10,xmm15 - pxor xmm11,xmm10 - pxor xmm8,xmm2 - pxor xmm2,xmm10 - pxor xmm12,xmm11 - pxor xmm8,xmm3 - pxor xmm3,xmm11 - pxor xmm13,xmm12 - pxor xmm8,xmm4 - pxor xmm4,xmm12 - pxor xmm8,xmm5 - pxor xmm5,xmm13 - movups xmm0,XMMWORD[32+r11] - - pxor xmm10,xmm9 - pxor xmm11,xmm9 - pxor xmm12,xmm9 - pxor xmm13,xmm9 - -DB 102,15,56,220,209 -DB 102,15,56,220,217 -DB 102,15,56,220,225 -DB 102,15,56,220,233 - movups xmm1,XMMWORD[48+r11] - -DB 102,15,56,220,208 -DB 102,15,56,220,216 -DB 102,15,56,220,224 -DB 102,15,56,220,232 - movups xmm0,XMMWORD[64+r11] - jmp NEAR $L$ocb_enc_loop4 - -ALIGN 32 -$L$ocb_enc_loop4: -DB 102,15,56,220,209 -DB 102,15,56,220,217 -DB 102,15,56,220,225 -DB 102,15,56,220,233 - movups xmm1,XMMWORD[rax*1+rcx] - add rax,32 - -DB 102,15,56,220,208 -DB 102,15,56,220,216 -DB 102,15,56,220,224 -DB 102,15,56,220,232 - movups xmm0,XMMWORD[((-16))+rax*1+rcx] - jnz NEAR $L$ocb_enc_loop4 - -DB 102,15,56,220,209 -DB 102,15,56,220,217 -DB 102,15,56,220,225 -DB 102,15,56,220,233 - movups xmm1,XMMWORD[16+r11] - mov rax,r10 - -DB 102,65,15,56,221,210 -DB 102,65,15,56,221,219 -DB 102,65,15,56,221,228 -DB 102,65,15,56,221,237 - DB 0F3h,0C3h ;repret - - - -ALIGN 32 -__ocb_encrypt1: - pxor xmm7,xmm15 - pxor xmm7,xmm9 - pxor xmm8,xmm2 - pxor xmm2,xmm7 - movups xmm0,XMMWORD[32+r11] - -DB 102,15,56,220,209 - movups xmm1,XMMWORD[48+r11] - pxor xmm7,xmm9 - -DB 102,15,56,220,208 - movups xmm0,XMMWORD[64+r11] - jmp NEAR $L$ocb_enc_loop1 - -ALIGN 32 -$L$ocb_enc_loop1: -DB 102,15,56,220,209 - movups xmm1,XMMWORD[rax*1+rcx] - add rax,32 - -DB 102,15,56,220,208 - movups xmm0,XMMWORD[((-16))+rax*1+rcx] - jnz NEAR $L$ocb_enc_loop1 - -DB 102,15,56,220,209 - movups xmm1,XMMWORD[16+r11] - mov rax,r10 - -DB 102,15,56,221,215 - DB 0F3h,0C3h ;repret - - -global aes_hw_ocb_decrypt - -ALIGN 32 -aes_hw_ocb_decrypt: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_aes_hw_ocb_decrypt: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - - lea rax,[rsp] - push rbx - - push rbp - - push r12 - - push r13 - - push r14 - - lea rsp,[((-160))+rsp] - movaps XMMWORD[rsp],xmm6 - movaps XMMWORD[16+rsp],xmm7 - movaps XMMWORD[32+rsp],xmm8 - movaps 
XMMWORD[48+rsp],xmm9 - movaps XMMWORD[64+rsp],xmm10 - movaps XMMWORD[80+rsp],xmm11 - movaps XMMWORD[96+rsp],xmm12 - movaps XMMWORD[112+rsp],xmm13 - movaps XMMWORD[128+rsp],xmm14 - movaps XMMWORD[144+rsp],xmm15 -$L$ocb_dec_body: - mov rbx,QWORD[56+rax] - mov rbp,QWORD[((56+8))+rax] - - mov r10d,DWORD[240+rcx] - mov r11,rcx - shl r10d,4 - movups xmm9,XMMWORD[rcx] - movups xmm1,XMMWORD[16+r10*1+rcx] - - movdqu xmm15,XMMWORD[r9] - pxor xmm9,xmm1 - pxor xmm15,xmm1 - - mov eax,16+32 - lea rcx,[32+r10*1+r11] - movups xmm1,XMMWORD[16+r11] - sub rax,r10 - mov r10,rax - - movdqu xmm10,XMMWORD[rbx] - movdqu xmm8,XMMWORD[rbp] - - test r8,1 - jnz NEAR $L$ocb_dec_odd - - bsf r12,r8 - add r8,1 - shl r12,4 - movdqu xmm7,XMMWORD[r12*1+rbx] - movdqu xmm2,XMMWORD[rdi] - lea rdi,[16+rdi] - - call __ocb_decrypt1 - - movdqa xmm15,xmm7 - movups XMMWORD[rsi],xmm2 - xorps xmm8,xmm2 - lea rsi,[16+rsi] - sub rdx,1 - jz NEAR $L$ocb_dec_done - -$L$ocb_dec_odd: - lea r12,[1+r8] - lea r13,[3+r8] - lea r14,[5+r8] - lea r8,[6+r8] - bsf r12,r12 - bsf r13,r13 - bsf r14,r14 - shl r12,4 - shl r13,4 - shl r14,4 - - sub rdx,6 - jc NEAR $L$ocb_dec_short - jmp NEAR $L$ocb_dec_grandloop - -ALIGN 32 -$L$ocb_dec_grandloop: - movdqu xmm2,XMMWORD[rdi] - movdqu xmm3,XMMWORD[16+rdi] - movdqu xmm4,XMMWORD[32+rdi] - movdqu xmm5,XMMWORD[48+rdi] - movdqu xmm6,XMMWORD[64+rdi] - movdqu xmm7,XMMWORD[80+rdi] - lea rdi,[96+rdi] - - call __ocb_decrypt6 - - movups XMMWORD[rsi],xmm2 - pxor xmm8,xmm2 - movups XMMWORD[16+rsi],xmm3 - pxor xmm8,xmm3 - movups XMMWORD[32+rsi],xmm4 - pxor xmm8,xmm4 - movups XMMWORD[48+rsi],xmm5 - pxor xmm8,xmm5 - movups XMMWORD[64+rsi],xmm6 - pxor xmm8,xmm6 - movups XMMWORD[80+rsi],xmm7 - pxor xmm8,xmm7 - lea rsi,[96+rsi] - sub rdx,6 - jnc NEAR $L$ocb_dec_grandloop - -$L$ocb_dec_short: - add rdx,6 - jz NEAR $L$ocb_dec_done - - movdqu xmm2,XMMWORD[rdi] - cmp rdx,2 - jb NEAR $L$ocb_dec_one - movdqu xmm3,XMMWORD[16+rdi] - je NEAR $L$ocb_dec_two - - movdqu xmm4,XMMWORD[32+rdi] - cmp rdx,4 - jb NEAR $L$ocb_dec_three - movdqu xmm5,XMMWORD[48+rdi] - je NEAR $L$ocb_dec_four - - movdqu xmm6,XMMWORD[64+rdi] - pxor xmm7,xmm7 - - call __ocb_decrypt6 - - movdqa xmm15,xmm14 - movups XMMWORD[rsi],xmm2 - pxor xmm8,xmm2 - movups XMMWORD[16+rsi],xmm3 - pxor xmm8,xmm3 - movups XMMWORD[32+rsi],xmm4 - pxor xmm8,xmm4 - movups XMMWORD[48+rsi],xmm5 - pxor xmm8,xmm5 - movups XMMWORD[64+rsi],xmm6 - pxor xmm8,xmm6 - - jmp NEAR $L$ocb_dec_done - -ALIGN 16 -$L$ocb_dec_one: - movdqa xmm7,xmm10 - - call __ocb_decrypt1 - - movdqa xmm15,xmm7 - movups XMMWORD[rsi],xmm2 - xorps xmm8,xmm2 - jmp NEAR $L$ocb_dec_done - -ALIGN 16 -$L$ocb_dec_two: - pxor xmm4,xmm4 - pxor xmm5,xmm5 - - call __ocb_decrypt4 - - movdqa xmm15,xmm11 - movups XMMWORD[rsi],xmm2 - xorps xmm8,xmm2 - movups XMMWORD[16+rsi],xmm3 - xorps xmm8,xmm3 - - jmp NEAR $L$ocb_dec_done - -ALIGN 16 -$L$ocb_dec_three: - pxor xmm5,xmm5 - - call __ocb_decrypt4 +ALIGN 32 +$L$ctr32_loop3: +DB 102,15,56,220,209 + lea rcx,[16+rcx] + dec eax +DB 102,15,56,220,217 +DB 102,15,56,220,225 + movups xmm1,XMMWORD[rcx] + jnz NEAR $L$ctr32_loop3 +DB 102,15,56,221,209 +DB 102,15,56,221,217 +DB 102,15,56,221,225 - movdqa xmm15,xmm12 + movups xmm10,XMMWORD[rdi] + xorps xmm2,xmm10 movups XMMWORD[rsi],xmm2 - xorps xmm8,xmm2 - movups XMMWORD[16+rsi],xmm3 - xorps xmm8,xmm3 - movups XMMWORD[32+rsi],xmm4 - xorps xmm8,xmm4 - - jmp NEAR $L$ocb_dec_done - -ALIGN 16 -$L$ocb_dec_four: - call __ocb_decrypt4 + cmp rdx,2 + jb NEAR $L$ctr32_done - movdqa xmm15,xmm13 - movups XMMWORD[rsi],xmm2 - pxor xmm8,xmm2 + movups 
xmm11,XMMWORD[16+rdi]
+ xorps xmm3,xmm11
 movups XMMWORD[16+rsi],xmm3
- pxor xmm8,xmm3
- movups XMMWORD[32+rsi],xmm4
- pxor xmm8,xmm4
- movups XMMWORD[48+rsi],xmm5
- pxor xmm8,xmm5
+ je NEAR $L$ctr32_done
-$L$ocb_dec_done:
- pxor xmm15,xmm0
- movdqu XMMWORD[rbp],xmm8
- movdqu XMMWORD[r9],xmm15
+ movups xmm12,XMMWORD[32+rdi]
+ xorps xmm4,xmm12
+ movups XMMWORD[32+rsi],xmm4
+$L$ctr32_done:
 xorps xmm0,xmm0
+ xor ebp,ebp
 pxor xmm1,xmm1
 pxor xmm2,xmm2
 pxor xmm3,xmm3
 pxor xmm4,xmm4
 pxor xmm5,xmm5
- movaps xmm6,XMMWORD[rsp]
+ movaps xmm6,XMMWORD[((-168))+r11]
+ movaps XMMWORD[(-168)+r11],xmm0
+ movaps xmm7,XMMWORD[((-152))+r11]
+ movaps XMMWORD[(-152)+r11],xmm0
+ movaps xmm8,XMMWORD[((-136))+r11]
+ movaps XMMWORD[(-136)+r11],xmm0
+ movaps xmm9,XMMWORD[((-120))+r11]
+ movaps XMMWORD[(-120)+r11],xmm0
+ movaps xmm10,XMMWORD[((-104))+r11]
+ movaps XMMWORD[(-104)+r11],xmm0
+ movaps xmm11,XMMWORD[((-88))+r11]
+ movaps XMMWORD[(-88)+r11],xmm0
+ movaps xmm12,XMMWORD[((-72))+r11]
+ movaps XMMWORD[(-72)+r11],xmm0
+ movaps xmm13,XMMWORD[((-56))+r11]
+ movaps XMMWORD[(-56)+r11],xmm0
+ movaps xmm14,XMMWORD[((-40))+r11]
+ movaps XMMWORD[(-40)+r11],xmm0
+ movaps xmm15,XMMWORD[((-24))+r11]
+ movaps XMMWORD[(-24)+r11],xmm0
 movaps XMMWORD[rsp],xmm0
- movaps xmm7,XMMWORD[16+rsp]
 movaps XMMWORD[16+rsp],xmm0
- movaps xmm8,XMMWORD[32+rsp]
 movaps XMMWORD[32+rsp],xmm0
- movaps xmm9,XMMWORD[48+rsp]
 movaps XMMWORD[48+rsp],xmm0
- movaps xmm10,XMMWORD[64+rsp]
 movaps XMMWORD[64+rsp],xmm0
- movaps xmm11,XMMWORD[80+rsp]
 movaps XMMWORD[80+rsp],xmm0
- movaps xmm12,XMMWORD[96+rsp]
 movaps XMMWORD[96+rsp],xmm0
- movaps xmm13,XMMWORD[112+rsp]
 movaps XMMWORD[112+rsp],xmm0
- movaps xmm14,XMMWORD[128+rsp]
- movaps XMMWORD[128+rsp],xmm0
- movaps xmm15,XMMWORD[144+rsp]
- movaps XMMWORD[144+rsp],xmm0
- lea rax,[((160+40))+rsp]
-$L$ocb_dec_pop:
- mov r14,QWORD[((-40))+rax]
-
- mov r13,QWORD[((-32))+rax]
-
- mov r12,QWORD[((-24))+rax]
-
- mov rbp,QWORD[((-16))+rax]
-
- mov rbx,QWORD[((-8))+rax]
+ mov rbp,QWORD[((-8))+r11]
- lea rsp,[rax]
+ lea rsp,[r11]
-$L$ocb_dec_epilogue:
+$L$ctr32_epilogue:
 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
 mov rsi,QWORD[16+rsp]
 DB 0F3h,0C3h ;repret
-$L$SEH_end_aes_hw_ocb_decrypt:
-
-
-ALIGN 32
-__ocb_decrypt6:
- pxor xmm15,xmm9
- movdqu xmm11,XMMWORD[r12*1+rbx]
- movdqa xmm12,xmm10
- movdqu xmm13,XMMWORD[r13*1+rbx]
- movdqa xmm14,xmm10
- pxor xmm10,xmm15
- movdqu xmm15,XMMWORD[r14*1+rbx]
- pxor xmm11,xmm10
- pxor xmm2,xmm10
- pxor xmm12,xmm11
- pxor xmm3,xmm11
- pxor xmm13,xmm12
- pxor xmm4,xmm12
- pxor xmm14,xmm13
- pxor xmm5,xmm13
- pxor xmm15,xmm14
- pxor xmm6,xmm14
- pxor xmm7,xmm15
- movups xmm0,XMMWORD[32+r11]
-
- lea r12,[1+r8]
- lea r13,[3+r8]
- lea r14,[5+r8]
- add r8,6
- pxor xmm10,xmm9
- bsf r12,r12
- bsf r13,r13
- bsf r14,r14
-
-DB 102,15,56,222,209
-DB 102,15,56,222,217
-DB 102,15,56,222,225
-DB 102,15,56,222,233
- pxor xmm11,xmm9
- pxor xmm12,xmm9
-DB 102,15,56,222,241
- pxor xmm13,xmm9
- pxor xmm14,xmm9
-DB 102,15,56,222,249
- movups xmm1,XMMWORD[48+r11]
- pxor xmm15,xmm9
-
-DB 102,15,56,222,208
-DB 102,15,56,222,216
-DB 102,15,56,222,224
-DB 102,15,56,222,232
-DB 102,15,56,222,240
-DB 102,15,56,222,248
- movups xmm0,XMMWORD[64+r11]
- shl r12,4
- shl r13,4
- jmp NEAR $L$ocb_dec_loop6
-
-ALIGN 32
-$L$ocb_dec_loop6:
-DB 102,15,56,222,209
-DB 102,15,56,222,217
-DB 102,15,56,222,225
-DB 102,15,56,222,233
-DB 102,15,56,222,241
-DB 102,15,56,222,249
- movups xmm1,XMMWORD[rax*1+rcx]
- add rax,32
-
-DB 102,15,56,222,208
-DB 102,15,56,222,216
-DB 102,15,56,222,224
-DB 102,15,56,222,232
-DB 102,15,56,222,240
-DB 102,15,56,222,248
- movups xmm0,XMMWORD[((-16))+rax*1+rcx]
- jnz NEAR $L$ocb_dec_loop6
-
-DB 102,15,56,222,209
-DB 102,15,56,222,217
-DB 102,15,56,222,225
-DB 102,15,56,222,233
-DB 102,15,56,222,241
-DB 102,15,56,222,249
- movups xmm1,XMMWORD[16+r11]
- shl r14,4
-
-DB 102,65,15,56,223,210
- movdqu xmm10,XMMWORD[rbx]
- mov rax,r10
-DB 102,65,15,56,223,219
-DB 102,65,15,56,223,228
-DB 102,65,15,56,223,237
-DB 102,65,15,56,223,246
-DB 102,65,15,56,223,255
- DB 0F3h,0C3h ;repret
-
-
-
-ALIGN 32
-__ocb_decrypt4:
- pxor xmm15,xmm9
- movdqu xmm11,XMMWORD[r12*1+rbx]
- movdqa xmm12,xmm10
- movdqu xmm13,XMMWORD[r13*1+rbx]
- pxor xmm10,xmm15
- pxor xmm11,xmm10
- pxor xmm2,xmm10
- pxor xmm12,xmm11
- pxor xmm3,xmm11
- pxor xmm13,xmm12
- pxor xmm4,xmm12
- pxor xmm5,xmm13
- movups xmm0,XMMWORD[32+r11]
-
- pxor xmm10,xmm9
- pxor xmm11,xmm9
- pxor xmm12,xmm9
- pxor xmm13,xmm9
-
-DB 102,15,56,222,209
-DB 102,15,56,222,217
-DB 102,15,56,222,225
-DB 102,15,56,222,233
- movups xmm1,XMMWORD[48+r11]
-
-DB 102,15,56,222,208
-DB 102,15,56,222,216
-DB 102,15,56,222,224
-DB 102,15,56,222,232
- movups xmm0,XMMWORD[64+r11]
- jmp NEAR $L$ocb_dec_loop4
-
-ALIGN 32
-$L$ocb_dec_loop4:
-DB 102,15,56,222,209
-DB 102,15,56,222,217
-DB 102,15,56,222,225
-DB 102,15,56,222,233
- movups xmm1,XMMWORD[rax*1+rcx]
- add rax,32
-
-DB 102,15,56,222,208
-DB 102,15,56,222,216
-DB 102,15,56,222,224
-DB 102,15,56,222,232
- movups xmm0,XMMWORD[((-16))+rax*1+rcx]
- jnz NEAR $L$ocb_dec_loop4
-
-DB 102,15,56,222,209
-DB 102,15,56,222,217
-DB 102,15,56,222,225
-DB 102,15,56,222,233
- movups xmm1,XMMWORD[16+r11]
- mov rax,r10
-
-DB 102,65,15,56,223,210
-DB 102,65,15,56,223,219
-DB 102,65,15,56,223,228
-DB 102,65,15,56,223,237
- DB 0F3h,0C3h ;repret
-
-
-
-ALIGN 32
-__ocb_decrypt1:
- pxor xmm7,xmm15
- pxor xmm7,xmm9
- pxor xmm2,xmm7
- movups xmm0,XMMWORD[32+r11]
-
-DB 102,15,56,222,209
- movups xmm1,XMMWORD[48+r11]
- pxor xmm7,xmm9
-
-DB 102,15,56,222,208
- movups xmm0,XMMWORD[64+r11]
- jmp NEAR $L$ocb_dec_loop1
-
-ALIGN 32
-$L$ocb_dec_loop1:
-DB 102,15,56,222,209
- movups xmm1,XMMWORD[rax*1+rcx]
- add rax,32
-
-DB 102,15,56,222,208
- movups xmm0,XMMWORD[((-16))+rax*1+rcx]
- jnz NEAR $L$ocb_dec_loop1
-
-DB 102,15,56,222,209
- movups xmm1,XMMWORD[16+r11]
- mov rax,r10
-
-DB 102,15,56,223,215
- DB 0F3h,0C3h ;repret
-
+$L$SEH_end_aes_hw_ctr32_encrypt_blocks:
 global aes_hw_cbc_encrypt
 ALIGN 16
@@ -3744,12 +1563,12 @@ $L$cbc_enc_loop:
 xorps xmm3,xmm0
 lea rcx,[32+rcx]
 xorps xmm2,xmm3
-$L$oop_enc1_15:
+$L$oop_enc1_6:
 DB 102,15,56,220,209
 dec eax
 movups xmm1,XMMWORD[rcx]
 lea rcx,[16+rcx]
- jnz NEAR $L$oop_enc1_15
+ jnz NEAR $L$oop_enc1_6
 DB 102,15,56,221,209
 mov eax,r10d
 mov rcx,r11
@@ -3795,12 +1614,12 @@ $L$cbc_decrypt:
 movups xmm1,XMMWORD[16+rcx]
 lea rcx,[32+rcx]
 xorps xmm2,xmm0
-$L$oop_dec1_16:
+$L$oop_dec1_7:
 DB 102,15,56,222,209
 dec r10d
 movups xmm1,XMMWORD[rcx]
 lea rcx,[16+rcx]
- jnz NEAR $L$oop_dec1_16
+ jnz NEAR $L$oop_dec1_7
 DB 102,15,56,223,209
 pxor xmm0,xmm0
 pxor xmm1,xmm1
@@ -4224,12 +2043,12 @@ $L$cbc_dec_one:
 movups xmm1,XMMWORD[16+rcx]
 lea rcx,[32+rcx]
 xorps xmm2,xmm0
-$L$oop_dec1_17:
+$L$oop_dec1_8:
 DB 102,15,56,222,209
 dec eax
 movups xmm1,XMMWORD[rcx]
 lea rcx,[16+rcx]
- jnz NEAR $L$oop_dec1_17
+ jnz NEAR $L$oop_dec1_8
 DB 102,15,56,223,209
 xorps xmm2,xmm10
 movaps xmm10,xmm11
@@ -4873,64 +2692,6 @@ ctr_xts_se_handler:
-ALIGN 16
-ocb_se_handler:
- push rsi
- push rdi
- push rbx
- push rbp
- push r12
- push r13
- push r14
- push r15
- pushfq
- sub rsp,64
-
- mov rax,QWORD[120+r8]
- mov rbx,QWORD[248+r8]
-
- mov rsi,QWORD[8+r9]
- mov r11,QWORD[56+r9]
-
- mov r10d,DWORD[r11]
- lea r10,[r10*1+rsi]
- cmp rbx,r10
- jb NEAR $L$common_seh_tail
-
- mov r10d,DWORD[4+r11]
- lea r10,[r10*1+rsi]
- cmp rbx,r10
- jae NEAR $L$common_seh_tail
-
- mov r10d,DWORD[8+r11]
- lea r10,[r10*1+rsi]
- cmp rbx,r10
- jae NEAR $L$ocb_no_xmm
-
- mov rax,QWORD[152+r8]
-
- lea rsi,[rax]
- lea rdi,[512+r8]
- mov ecx,20
- DD 0xa548f3fc
- lea rax,[((160+40))+rax]
-
-$L$ocb_no_xmm:
- mov rbx,QWORD[((-8))+rax]
- mov rbp,QWORD[((-16))+rax]
- mov r12,QWORD[((-24))+rax]
- mov r13,QWORD[((-32))+rax]
- mov r14,QWORD[((-40))+rax]
-
- mov QWORD[144+r8],rbx
- mov QWORD[160+r8],rbp
- mov QWORD[216+r8],r12
- mov QWORD[224+r8],r13
- mov QWORD[232+r8],r14
-
- jmp NEAR $L$common_seh_tail
-
-
 ALIGN 16
 cbc_se_handler:
 push rsi
@@ -5019,33 +2780,9 @@ ALIGN 4
 DD $L$SEH_end_aes_hw_ecb_encrypt wrt ..imagebase
 DD $L$SEH_info_ecb wrt ..imagebase
- DD $L$SEH_begin_aes_hw_ccm64_encrypt_blocks wrt ..imagebase
- DD $L$SEH_end_aes_hw_ccm64_encrypt_blocks wrt ..imagebase
- DD $L$SEH_info_ccm64_enc wrt ..imagebase
-
- DD $L$SEH_begin_aes_hw_ccm64_decrypt_blocks wrt ..imagebase
- DD $L$SEH_end_aes_hw_ccm64_decrypt_blocks wrt ..imagebase
- DD $L$SEH_info_ccm64_dec wrt ..imagebase
-
 DD $L$SEH_begin_aes_hw_ctr32_encrypt_blocks wrt ..imagebase
 DD $L$SEH_end_aes_hw_ctr32_encrypt_blocks wrt ..imagebase
 DD $L$SEH_info_ctr32 wrt ..imagebase
-
- DD $L$SEH_begin_aes_hw_xts_encrypt wrt ..imagebase
- DD $L$SEH_end_aes_hw_xts_encrypt wrt ..imagebase
- DD $L$SEH_info_xts_enc wrt ..imagebase
-
- DD $L$SEH_begin_aes_hw_xts_decrypt wrt ..imagebase
- DD $L$SEH_end_aes_hw_xts_decrypt wrt ..imagebase
- DD $L$SEH_info_xts_dec wrt ..imagebase
-
- DD $L$SEH_begin_aes_hw_ocb_encrypt wrt ..imagebase
- DD $L$SEH_end_aes_hw_ocb_encrypt wrt ..imagebase
- DD $L$SEH_info_ocb_enc wrt ..imagebase
-
- DD $L$SEH_begin_aes_hw_ocb_decrypt wrt ..imagebase
- DD $L$SEH_end_aes_hw_ocb_decrypt wrt ..imagebase
- DD $L$SEH_info_ocb_dec wrt ..imagebase
 DD $L$SEH_begin_aes_hw_cbc_encrypt wrt ..imagebase
 DD $L$SEH_end_aes_hw_cbc_encrypt wrt ..imagebase
 DD $L$SEH_info_cbc wrt ..imagebase
@@ -5063,38 +2800,10 @@ $L$SEH_info_ecb:
 DB 9,0,0,0
 DD ecb_ccm64_se_handler wrt ..imagebase
 DD $L$ecb_enc_body wrt ..imagebase,$L$ecb_enc_ret wrt ..imagebase
-$L$SEH_info_ccm64_enc:
-DB 9,0,0,0
- DD ecb_ccm64_se_handler wrt ..imagebase
- DD $L$ccm64_enc_body wrt ..imagebase,$L$ccm64_enc_ret wrt ..imagebase
-$L$SEH_info_ccm64_dec:
-DB 9,0,0,0
- DD ecb_ccm64_se_handler wrt ..imagebase
- DD $L$ccm64_dec_body wrt ..imagebase,$L$ccm64_dec_ret wrt ..imagebase
 $L$SEH_info_ctr32:
 DB 9,0,0,0
 DD ctr_xts_se_handler wrt ..imagebase
 DD $L$ctr32_body wrt ..imagebase,$L$ctr32_epilogue wrt ..imagebase
-$L$SEH_info_xts_enc:
-DB 9,0,0,0
- DD ctr_xts_se_handler wrt ..imagebase
- DD $L$xts_enc_body wrt ..imagebase,$L$xts_enc_epilogue wrt ..imagebase
-$L$SEH_info_xts_dec:
-DB 9,0,0,0
- DD ctr_xts_se_handler wrt ..imagebase
- DD $L$xts_dec_body wrt ..imagebase,$L$xts_dec_epilogue wrt ..imagebase
-$L$SEH_info_ocb_enc:
-DB 9,0,0,0
- DD ocb_se_handler wrt ..imagebase
- DD $L$ocb_enc_body wrt ..imagebase,$L$ocb_enc_epilogue wrt ..imagebase
- DD $L$ocb_enc_pop wrt ..imagebase
- DD 0
-$L$SEH_info_ocb_dec:
-DB 9,0,0,0
- DD ocb_se_handler wrt ..imagebase
- DD $L$ocb_dec_body wrt ..imagebase,$L$ocb_dec_epilogue wrt ..imagebase
- DD $L$ocb_dec_pop wrt ..imagebase
- DD 0
 $L$SEH_info_cbc:
 DB 9,0,0,0
 DD cbc_se_handler wrt ..imagebase
diff --git a/third_party/boringssl/win-x86_64/crypto/fipsmodule/rdrand-x86_64.asm b/third_party/boringssl/win-x86_64/crypto/fipsmodule/rdrand-x86_64.asm
index a34249b92ea785..89b91de10d1dbd 100644
--- a/third_party/boringssl/win-x86_64/crypto/fipsmodule/rdrand-x86_64.asm
+++ b/third_party/boringssl/win-x86_64/crypto/fipsmodule/rdrand-x86_64.asm
@@ -19,23 +19,12 @@ global CRYPTO_rdrand
 ALIGN 16
 CRYPTO_rdrand:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_CRYPTO_rdrand:
- mov rdi,rcx
-
-
 xor rax,rax
-
-
-DB 0x48,0x0f,0xc7,0xf1
+DB 73,15,199,240
 adc rax,rax
- mov QWORD[rdi],rcx
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
+ mov QWORD[rcx],r8
 DB 0F3h,0C3h ;repret
@@ -43,39 +32,27 @@ DB 0x48,0x0f,0xc7,0xf1
+
 global CRYPTO_rdrand_multiple8_buf
 ALIGN 16
 CRYPTO_rdrand_multiple8_buf:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_CRYPTO_rdrand_multiple8_buf:
- mov rdi,rcx
- mov rsi,rdx
-
-
- test rsi,rsi
+ test rdx,rdx
 jz NEAR $L$out
- mov rdx,8
+ mov r8,8
 $L$loop:
-
-
-DB 0x48,0x0f,0xc7,0xf1
+DB 73,15,199,241
 jnc NEAR $L$err
- mov QWORD[rdi],rcx
- add rdi,rdx
- sub rsi,rdx
+ mov QWORD[rcx],r9
+ add rcx,r8
+ sub rdx,r8
 jnz NEAR $L$loop
 $L$out:
 mov rax,1
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
 DB 0F3h,0C3h ;repret
 $L$err:
 xor rax,rax
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
 DB 0F3h,0C3h ;repret
+
diff --git a/third_party/boringssl/win-x86_64/crypto/fipsmodule/x86_64-mont5.asm b/third_party/boringssl/win-x86_64/crypto/fipsmodule/x86_64-mont5.asm
index 298fd0576a085d..7a1d5dbd9c489b 100644
--- a/third_party/boringssl/win-x86_64/crypto/fipsmodule/x86_64-mont5.asm
+++ b/third_party/boringssl/win-x86_64/crypto/fipsmodule/x86_64-mont5.asm
@@ -591,6 +591,7 @@ $L$SEH_end_bn_mul4x_mont_gather5:
 ALIGN 32
 mul4x_internal:
+
 shl r9,5
 movd xmm5,DWORD[56+rax]
 lea rax,[$L$inc]
@@ -1113,6 +1114,7 @@ $L$inner4x:
 mov r15,QWORD[24+rbp]
 jmp NEAR $L$sqr4x_sub_entry
+
 global bn_power5
 ALIGN 32
@@ -1340,6 +1342,7 @@ __bn_sqr8x_internal:
+
 lea rbp,[32+r10]
@@ -2045,8 +2048,10 @@ DB 102,73,15,126,217
 DB 0F3h,0C3h ;repret
+
 ALIGN 32
 __bn_post4x_internal:
+
 mov r12,QWORD[rbp]
 lea rbx,[r9*1+rdi]
 mov rcx,r9
@@ -2098,10 +2103,12 @@ $L$sqr4x_sub_entry:
 neg r9
 DB 0F3h,0C3h ;repret
+
 global bn_from_montgomery
 ALIGN 32
 bn_from_montgomery:
+
 test DWORD[48+rsp],7
 jz NEAR bn_from_mont8x
 xor eax,eax
@@ -2109,6 +2116,7 @@ bn_from_montgomery:
+
 ALIGN 32
 bn_from_mont8x:
 mov QWORD[8+rsp],rdi ;WIN64 prologue
@@ -2418,6 +2426,7 @@ $L$SEH_end_bn_mulx4x_mont_gather5:
 ALIGN 32
 mulx4x_internal:
+
 mov QWORD[8+rsp],r9
 mov r10,r9
 neg r9
@@ -2838,6 +2847,7 @@ $L$mulx4x_inner:
 jmp NEAR $L$sqrx4x_sub_entry
+
 ALIGN 32
 bn_powerx5:
 mov QWORD[8+rsp],rdi ;WIN64 prologue
@@ -3607,7 +3617,9 @@ DB 102,72,15,126,213
 ALIGN 32
+
 __bn_postx4x_internal:
+
 mov r12,QWORD[rbp]
 mov r10,rcx
 mov r9,rcx
@@ -3656,10 +3668,12 @@ $L$sqrx4x_sub_entry:
 DB 0F3h,0C3h ;repret
+
 global bn_scatter5
 ALIGN 16
 bn_scatter5:
+
 cmp edx,0
 jz NEAR $L$scatter_epilogue
 lea r8,[r9*8+r8]
@@ -3674,13 +3688,16 @@ $L$scatter_epilogue:
 DB 0F3h,0C3h ;repret
+
 global bn_gather5
 ALIGN 32
 bn_gather5:
+
 $L$SEH_begin_bn_gather5:
 DB 0x4c,0x8d,0x14,0x24
+
 DB 0x48,0x81,0xec,0x08,0x01,0x00,0x00
 lea rax,[$L$inc]
 and rsp,-16
@@ -3834,9 +3851,11 @@ $L$gather:
 jnz NEAR $L$gather
 lea rsp,[r10]
+
 DB 0F3h,0C3h ;repret
 $L$SEH_end_bn_gather5:
+
 ALIGN 64
 $L$inc:
 DD 0,0,1,1