diff --git a/internal/sm2ec/p256_asm_amd64.s b/internal/sm2ec/p256_asm_amd64.s index 979fcc59..7069219d 100644 --- a/internal/sm2ec/p256_asm_amd64.s +++ b/internal/sm2ec/p256_asm_amd64.s @@ -2095,7 +2095,6 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8 CMPB ·supportBMI2+0(SB), $0x01 JEQ internalMulBMI2 - // [t3, t2, t1, t0] * acc4 MOVQ acc4, mul0 MULQ t0 MOVQ mul0, acc0 @@ -2119,7 +2118,6 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8 ADCQ $0, mul1 MOVQ mul1, acc4 - // [t3, t2, t1, t0] * acc5 MOVQ acc5, mul0 MULQ t0 ADDQ mul0, acc1 @@ -2150,7 +2148,6 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8 ADCQ $0, mul1 MOVQ mul1, acc5 - // [t3, t2, t1, t0] * acc6 MOVQ acc6, mul0 MULQ t0 ADDQ mul0, acc2 @@ -2181,7 +2178,6 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8 ADCQ $0, mul1 MOVQ mul1, acc6 - // [t3, t2, t1, t0] * acc7 MOVQ acc7, mul0 MULQ t0 ADDQ mul0, acc3 @@ -2211,8 +2207,6 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8 ADDQ mul0, acc6 ADCQ $0, mul1 MOVQ mul1, acc7 - - // T = [acc7, acc6, acc5, acc4, acc3, acc2, acc1, acc0] // First reduction step MOVQ acc0, mul0 MOVQ acc0, mul1 @@ -2298,9 +2292,7 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8 CMOVQCS acc3, acc7 RET - internalMulBMI2: - // [t3, t2, t1, t0] * acc4 MOVQ acc4, mul1 MULXQ t0, acc0, acc1 @@ -2314,7 +2306,6 @@ internalMulBMI2: ADCQ mul0, acc3 ADCQ $0, acc4 - // [t3, t2, t1, t0] * acc5 MOVQ acc5, mul1 MULXQ t0, mul0, hlp ADDQ mul0, acc1 @@ -2335,7 +2326,6 @@ internalMulBMI2: ADDQ mul0, acc4 ADCQ $0, acc5 - // [t3, t2, t1, t0] * acc6 MOVQ acc6, mul1 MULXQ t0, mul0, hlp ADDQ mul0, acc2 @@ -2356,7 +2346,6 @@ internalMulBMI2: ADDQ mul0, acc5 ADCQ $0, acc6 - // [t3, t2, t1, t0] * acc7 MOVQ acc7, mul1 MULXQ t0, mul0, hlp ADDQ mul0, acc3 @@ -2377,7 +2366,6 @@ internalMulBMI2: ADDQ mul0, acc6 ADCQ $0, acc7 - // T = [acc7, acc6, acc5, acc4, acc3, acc2, acc1, acc0] // First reduction step MOVQ acc0, mul0 MOVQ acc0, mul1 @@ -2554,7 +2542,6 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8 CMPB ·supportBMI2+0(SB), $0x01 JEQ internalSqrBMI2 - // [acc7, acc6, acc5] * acc4 MOVQ acc4, mul0 MULQ acc5 MOVQ mul0, acc1 @@ -2572,7 +2559,6 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8 ADCQ $0, mul1 MOVQ mul1, t0 - // [acc7, acc6] * acc5 MOVQ acc5, mul0 MULQ acc6 ADDQ mul0, acc3 @@ -2587,7 +2573,6 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8 ADCQ $0, mul1 MOVQ mul1, t1 - // acc7 * acc6 MOVQ acc6, mul0 MULQ acc7 ADDQ mul0, t1 @@ -2628,70 +2613,64 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8 ADCQ mul0, t2 ADCQ DX, t3 - // T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0] sm2P256SqrReductionInternal() RET internalSqrBMI2: - XORQ t3, t3 - - // [acc7, acc6, acc5] * acc4 MOVQ acc4, mul1 MULXQ acc5, acc1, acc2 MULXQ acc6, mul0, acc3 - ADOXQ mul0, acc2 + ADDQ mul0, acc2 MULXQ acc7, mul0, t0 - ADOXQ mul0, acc3 - ADOXQ t3, t0 + ADCQ mul0, acc3 + ADCQ $0, t0 - // [acc7, acc6] * acc5 MOVQ acc5, mul1 MULXQ acc6, mul0, hlp - ADOXQ mul0, acc3 + ADDQ mul0, acc3 + ADCQ hlp, t0 MULXQ acc7, mul0, t1 - ADCXQ hlp, mul0 - ADOXQ mul0, t0 - ADCXQ t3, t1 + ADCQ $0, t1 + ADDQ mul0, t0 - // acc7 * acc6 MOVQ acc6, mul1 MULXQ acc7, mul0, t2 - ADOXQ mul0, t1 - ADOXQ t3, t2 - + ADCQ mul0, t1 + ADCQ $0, t2 + XORQ t3, t3 + // *2 - ADOXQ acc1, acc1 - ADOXQ acc2, acc2 - ADOXQ acc3, acc3 - ADOXQ t0, t0 - ADOXQ t1, t1 - ADOXQ t2, t2 - ADOXQ t3, t3 + ADDQ acc1, acc1 + ADCQ acc2, acc2 + ADCQ acc3, acc3 + ADCQ t0, t0 + ADCQ t1, t1 + ADCQ t2, t2 + ADCQ $0, t3 // Missing products MOVQ acc4, mul1 MULXQ mul1, acc0, acc4 - ADCXQ acc4, acc1 + ADDQ acc4, acc1 MOVQ acc5, mul1 MULXQ mul1, mul0, acc4 - ADCXQ mul0, acc2 - ADCXQ acc4, acc3 + ADCQ mul0, acc2 + ADCQ acc4, acc3 MOVQ acc6, mul1 MULXQ mul1, mul0, acc4 - ADCXQ mul0, t0 - ADCXQ acc4, t1 + ADCQ mul0, t0 + ADCQ acc4, t1 MOVQ acc7, mul1 MULXQ mul1, mul0, acc4 - ADCXQ mul0, t2 - ADCXQ acc4, t3 - - // T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0] + ADCQ mul0, t2 + ADCQ acc4, t3 + sm2P256SqrReductionInternal() RET