Skip to content

Commit

Permalink
internal/sm2ec: mul WWMM reduction, sub first
Browse files Browse the repository at this point in the history
  • Loading branch information
emmansun authored Feb 22, 2024
1 parent 2553456 commit 052040f
Show file tree
Hide file tree
Showing 5 changed files with 206 additions and 361 deletions.
125 changes: 4 additions & 121 deletions internal/sm2ec/p256_asm_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ ordSqrLoop:
ADCQ DX, acc2
ADCQ $0, acc3
ADCQ $0, acc0
// calculate the positive part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
MOVQ t0, AX
MOVQ t0, DX
SHLQ $32, AX
Expand Down Expand Up @@ -484,7 +484,7 @@ ordSqrLoopBMI2:
ADCQ t1, acc2 // (carry4, acc2) = acc2 + t1 + carry3
ADCQ $0, acc3 // (carry5, acc3) = acc3 + carry4
ADCQ $0, acc0 // acc0 = t0 + carry5
// calculate the positive part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
MOVQ t0, AX
//MOVQ t0, DX // Not required: t0 is already DX
SHLQ $32, AX
Expand Down Expand Up @@ -759,66 +759,8 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$8
ADDQ mul0, acc6
ADCQ $0, mul1
MOVQ mul1, acc7
// First reduction step
MOVQ acc0, mul0
MOVQ acc0, mul1
SHLQ $32, mul0
SHRQ $32, mul1

ADDQ acc0, acc1
ADCQ $0, acc2
ADCQ $0, acc3
ADCQ $0, acc0

SUBQ mul0, acc1
SBBQ mul1, acc2
SBBQ mul0, acc3
SBBQ mul1, acc0
// Second reduction step
MOVQ acc1, mul0
MOVQ acc1, mul1
SHLQ $32, mul0
SHRQ $32, mul1

ADDQ acc1, acc2
ADCQ $0, acc3
ADCQ $0, acc0
ADCQ $0, acc1
sm2P256MulReductionInternal()

SUBQ mul0, acc2
SBBQ mul1, acc3
SBBQ mul0, acc0
SBBQ mul1, acc1
// Third reduction step
MOVQ acc2, mul0
MOVQ acc2, mul1
SHLQ $32, mul0
SHRQ $32, mul1

ADDQ acc2, acc3
ADCQ $0, acc0
ADCQ $0, acc1
ADCQ $0, acc2

SUBQ mul0, acc3
SBBQ mul1, acc0
SBBQ mul0, acc1
SBBQ mul1, acc2
// Last reduction step
MOVQ acc3, mul0
MOVQ acc3, mul1
SHLQ $32, mul0
SHRQ $32, mul1

ADDQ acc3, acc0
ADCQ $0, acc1
ADCQ $0, acc2
ADCQ $0, acc3

SUBQ mul0, acc0
SBBQ mul1, acc1
SBBQ mul0, acc2
SBBQ mul1, acc3
MOVQ $0, BP
// Add bits [511:256] of the result
ADCQ acc0, acc4
Expand Down Expand Up @@ -918,66 +860,7 @@ internalMulBMI2:
ADDQ mul0, acc6
ADCQ $0, acc7

// First reduction step
MOVQ acc0, mul0
MOVQ acc0, mul1
SHLQ $32, mul0
SHRQ $32, mul1

ADDQ acc0, acc1
ADCQ $0, acc2
ADCQ $0, acc3
ADCQ $0, acc0

SUBQ mul0, acc1
SBBQ mul1, acc2
SBBQ mul0, acc3
SBBQ mul1, acc0
// Second reduction step
MOVQ acc1, mul0
MOVQ acc1, mul1
SHLQ $32, mul0
SHRQ $32, mul1

ADDQ acc1, acc2
ADCQ $0, acc3
ADCQ $0, acc0
ADCQ $0, acc1

SUBQ mul0, acc2
SBBQ mul1, acc3
SBBQ mul0, acc0
SBBQ mul1, acc1
// Third reduction step
MOVQ acc2, mul0
MOVQ acc2, mul1
SHLQ $32, mul0
SHRQ $32, mul1

ADDQ acc2, acc3
ADCQ $0, acc0
ADCQ $0, acc1
ADCQ $0, acc2

SUBQ mul0, acc3
SBBQ mul1, acc0
SBBQ mul0, acc1
SBBQ mul1, acc2
// Last reduction step
MOVQ acc3, mul0
MOVQ acc3, mul1
SHLQ $32, mul0
SHRQ $32, mul1

ADDQ acc3, acc0
ADCQ $0, acc1
ADCQ $0, acc2
ADCQ $0, acc3

SUBQ mul0, acc0
SBBQ mul1, acc1
SBBQ mul0, acc2
SBBQ mul1, acc3
sm2P256MulReductionInternal()
MOVQ $0, BP
// Add bits [511:256] of the result
ADCQ acc0, acc4
Expand Down
93 changes: 48 additions & 45 deletions internal/sm2ec/p256_asm_arm64.s
Original file line number Diff line number Diff line change
Expand Up @@ -207,54 +207,57 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0
LSL $32, acc0, y0
LSR $32, acc0, y1

SUBS y0, acc1
SBCS y1, acc2
SBCS y0, acc3
SBC y1, acc0, y0

ADDS acc0, acc1, acc1
ADCS $0, acc2, acc2
ADCS $0, acc3, acc3
ADC $0, acc0, acc0
ADC $0, y0, acc0

SUBS y0, acc1
SBCS y1, acc2
SBCS y0, acc3
SBC y1, acc0
// Second reduction step
LSL $32, acc1, y0
LSR $32, acc1, y1

SUBS y0, acc2
SBCS y1, acc3
SBCS y0, acc0
SBC y1, acc1, y0

ADDS acc1, acc2, acc2
ADCS $0, acc3, acc3
ADCS $0, acc0, acc0
ADC $0, acc1, acc1
ADC $0, y0, acc1

SUBS y0, acc2
SBCS y1, acc3
SBCS y0, acc0
SBC y1, acc1
// Third reduction step
LSL $32, acc2, y0
LSR $32, acc2, y1

SUBS y0, acc3
SBCS y1, acc0
SBCS y0, acc1
SBC y1, acc2, y0

ADDS acc2, acc3, acc3
ADCS $0, acc0, acc0
ADCS $0, acc1, acc1
ADC $0, acc2, acc2
ADC $0, y0, acc2

SUBS y0, acc3
SBCS y1, acc0
SBCS y0, acc1
SBC y1, acc2
// Last reduction step
LSL $32, acc3, y0
LSR $32, acc3, y1

ADDS acc3, acc0, acc0
ADCS $0, acc1, acc1
ADCS $0, acc2, acc2
ADC $0, acc3, acc3

SUBS y0, acc0
SBCS y1, acc1
SBCS y0, acc2
SBC y1, acc3
SBC y1, acc3, y0

ADDS acc3, acc0, acc0
ADCS $0, acc1, acc1
ADCS $0, acc2, acc2
ADC $0, y0, acc3

SUBS const0, acc0, t0
SBCS const1, acc1, t1
Expand Down Expand Up @@ -967,15 +970,15 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
LSL $32, acc0, t0
LSR $32, acc0, t1

ADDS acc0, acc1, acc1
ADCS $0, acc2, acc2
ADCS $0, acc3, acc3
ADC $0, acc0, acc0

SUBS t0, acc1
SBCS t1, acc2
SBCS t0, acc3
SBC t1, acc0
SBC t1, acc0, t0

ADDS acc0, acc1, acc1
ADCS $0, acc2, acc2
ADCS $0, acc3, acc3
ADC $0, t0, acc0

// y[1] * x
MUL y1, x0, t0
Expand Down Expand Up @@ -1003,15 +1006,15 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
LSL $32, acc1, t0
LSR $32, acc1, t1

ADDS acc1, acc2, acc2
ADCS $0, acc3, acc3
ADCS $0, acc0, acc0
ADC $0, acc1, acc1

SUBS t0, acc2
SBCS t1, acc3
SBCS t0, acc0
SBC t1, acc1
SBC t1, acc1, t0

ADDS acc1, acc2, acc2
ADCS $0, acc3, acc3
ADCS $0, acc0, acc0
ADC $0, t0, acc1

// y[2] * x
MUL y2, x0, t0
Expand Down Expand Up @@ -1039,15 +1042,15 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
LSL $32, acc2, t0
LSR $32, acc2, t1

ADDS acc2, acc3, acc3
ADCS $0, acc0, acc0
ADCS $0, acc1, acc1
ADC $0, acc2, acc2

SUBS t0, acc3
SBCS t1, acc0
SBCS t0, acc1
SBC t1, acc2
SBC t1, acc2, t0

ADDS acc2, acc3, acc3
ADCS $0, acc0, acc0
ADCS $0, acc1, acc1
ADC $0, t0, acc2

// y[3] * x
MUL y3, x0, t0
Expand Down Expand Up @@ -1075,15 +1078,15 @@ TEXT sm2P256MulInternal<>(SB),NOSPLIT,$0
LSL $32, acc3, t0
LSR $32, acc3, t1

ADDS acc3, acc0, acc0
ADCS $0, acc1, acc1
ADCS $0, acc2, acc2
ADC $0, acc3, acc3

SUBS t0, acc0
SBCS t1, acc1
SBCS t0, acc2
SBC t1, acc3
SBC t1, acc3, t0

ADDS acc3, acc0, acc0
ADCS $0, acc1, acc1
ADCS $0, acc2, acc2
ADC $0, t0, acc3

// Add bits [511:256] of the mul result
ADDS acc4, acc0, acc0
Expand Down
Loading

0 comments on commit 052040f

Please sign in to comment.