Skip to content

Commit

Permalink
internal/sm2ec: not use ADX first
Browse files Browse the repository at this point in the history
  • Loading branch information
emmansun authored Jul 21, 2023
1 parent 5b5b26c commit 76131e6
Showing 1 changed file with 26 additions and 47 deletions.
73 changes: 26 additions & 47 deletions internal/sm2ec/p256_asm_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -2095,7 +2095,6 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
CMPB ·supportBMI2+0(SB), $0x01
JEQ internalMulBMI2

// [t3, t2, t1, t0] * acc4
MOVQ acc4, mul0
MULQ t0
MOVQ mul0, acc0
Expand All @@ -2119,7 +2118,6 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
ADCQ $0, mul1
MOVQ mul1, acc4

// [t3, t2, t1, t0] * acc5
MOVQ acc5, mul0
MULQ t0
ADDQ mul0, acc1
Expand Down Expand Up @@ -2150,7 +2148,6 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
ADCQ $0, mul1
MOVQ mul1, acc5

// [t3, t2, t1, t0] * acc6
MOVQ acc6, mul0
MULQ t0
ADDQ mul0, acc2
Expand Down Expand Up @@ -2181,7 +2178,6 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
ADCQ $0, mul1
MOVQ mul1, acc6

// [t3, t2, t1, t0] * acc7
MOVQ acc7, mul0
MULQ t0
ADDQ mul0, acc3
Expand Down Expand Up @@ -2211,8 +2207,6 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
ADDQ mul0, acc6
ADCQ $0, mul1
MOVQ mul1, acc7

// T = [acc7, acc6, acc5, acc4, acc3, acc2, acc1, acc0]
// First reduction step
MOVQ acc0, mul0
MOVQ acc0, mul1
Expand Down Expand Up @@ -2298,9 +2292,7 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
CMOVQCS acc3, acc7

RET

internalMulBMI2:
// [t3, t2, t1, t0] * acc4
MOVQ acc4, mul1
MULXQ t0, acc0, acc1

Expand All @@ -2314,7 +2306,6 @@ internalMulBMI2:
ADCQ mul0, acc3
ADCQ $0, acc4

// [t3, t2, t1, t0] * acc5
MOVQ acc5, mul1
MULXQ t0, mul0, hlp
ADDQ mul0, acc1
Expand All @@ -2335,7 +2326,6 @@ internalMulBMI2:
ADDQ mul0, acc4
ADCQ $0, acc5

// [t3, t2, t1, t0] * acc6
MOVQ acc6, mul1
MULXQ t0, mul0, hlp
ADDQ mul0, acc2
Expand All @@ -2356,7 +2346,6 @@ internalMulBMI2:
ADDQ mul0, acc5
ADCQ $0, acc6

// [t3, t2, t1, t0] * acc7
MOVQ acc7, mul1
MULXQ t0, mul0, hlp
ADDQ mul0, acc3
Expand All @@ -2377,7 +2366,6 @@ internalMulBMI2:
ADDQ mul0, acc6
ADCQ $0, acc7

// T = [acc7, acc6, acc5, acc4, acc3, acc2, acc1, acc0]
// First reduction step
MOVQ acc0, mul0
MOVQ acc0, mul1
Expand Down Expand Up @@ -2554,7 +2542,6 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
CMPB ·supportBMI2+0(SB), $0x01
JEQ internalSqrBMI2

// [acc7, acc6, acc5] * acc4
MOVQ acc4, mul0
MULQ acc5
MOVQ mul0, acc1
Expand All @@ -2572,7 +2559,6 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
ADCQ $0, mul1
MOVQ mul1, t0

// [acc7, acc6] * acc5
MOVQ acc5, mul0
MULQ acc6
ADDQ mul0, acc3
Expand All @@ -2587,7 +2573,6 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
ADCQ $0, mul1
MOVQ mul1, t1

// acc7 * acc6
MOVQ acc6, mul0
MULQ acc7
ADDQ mul0, t1
Expand Down Expand Up @@ -2628,70 +2613,64 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
ADCQ mul0, t2
ADCQ DX, t3

// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
sm2P256SqrReductionInternal()
RET

internalSqrBMI2:
XORQ t3, t3

// [acc7, acc6, acc5] * acc4
MOVQ acc4, mul1
MULXQ acc5, acc1, acc2

MULXQ acc6, mul0, acc3
ADOXQ mul0, acc2
ADDQ mul0, acc2

MULXQ acc7, mul0, t0
ADOXQ mul0, acc3
ADOXQ t3, t0
ADCQ mul0, acc3
ADCQ $0, t0

// [acc7, acc6] * acc5
MOVQ acc5, mul1
MULXQ acc6, mul0, hlp
ADOXQ mul0, acc3
ADDQ mul0, acc3
ADCQ hlp, t0

MULXQ acc7, mul0, t1
ADCXQ hlp, mul0
ADOXQ mul0, t0
ADCXQ t3, t1
ADCQ $0, t1
ADDQ mul0, t0

// acc7 * acc6
MOVQ acc6, mul1
MULXQ acc7, mul0, t2
ADOXQ mul0, t1
ADOXQ t3, t2

ADCQ mul0, t1
ADCQ $0, t2
XORQ t3, t3

// *2
ADOXQ acc1, acc1
ADOXQ acc2, acc2
ADOXQ acc3, acc3
ADOXQ t0, t0
ADOXQ t1, t1
ADOXQ t2, t2
ADOXQ t3, t3
ADDQ acc1, acc1
ADCQ acc2, acc2
ADCQ acc3, acc3
ADCQ t0, t0
ADCQ t1, t1
ADCQ t2, t2
ADCQ $0, t3

// Missing products
MOVQ acc4, mul1
MULXQ mul1, acc0, acc4
ADCXQ acc4, acc1
ADDQ acc4, acc1

MOVQ acc5, mul1
MULXQ mul1, mul0, acc4
ADCXQ mul0, acc2
ADCXQ acc4, acc3
ADCQ mul0, acc2
ADCQ acc4, acc3

MOVQ acc6, mul1
MULXQ mul1, mul0, acc4
ADCXQ mul0, t0
ADCXQ acc4, t1
ADCQ mul0, t0
ADCQ acc4, t1

MOVQ acc7, mul1
MULXQ mul1, mul0, acc4
ADCXQ mul0, t2
ADCXQ acc4, t3

// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
ADCQ mul0, t2
ADCQ acc4, t3

sm2P256SqrReductionInternal()

RET
Expand Down

0 comments on commit 76131e6

Please sign in to comment.