Skip to content

Commit

Permalink
internal/sm2ec: amd64, optimize select SIMD
Browse files Browse the repository at this point in the history
  • Loading branch information
emmansun authored Mar 4, 2024
1 parent f7beee3 commit 48589f0
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 32 deletions.
14 changes: 4 additions & 10 deletions internal/sm2ec/p256_asm_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -483,8 +483,6 @@ internalSqrBMI2:
ST (yout) \
\// Load stored values from stack
MOVQ rptr, AX \
MOVL sel_save, BX \
MOVL zero_save, CX \

// func p256PointAddAffineAsm(res, in1 *SM2P256Point, in2 *p256AffinePoint, sign, sel, zero int)
TEXT ·p256PointAddAffineAsm(SB),0,$512-48
Expand Down Expand Up @@ -528,8 +526,8 @@ TEXT ·p256PointAddAffineAsm(SB),0,$512-48
MOVOU zout(16*0), X4
MOVOU zout(16*1), X5

MOVL BX, X6 // sel
MOVL CX, X7 // zero
MOVL sel_save, X6 // sel
MOVL zero_save, X7 // zero

PXOR X8, X8 // X8's bits are all 0
PCMPEQL X9, X9 // X9's bits are all 1
Expand Down Expand Up @@ -626,13 +624,9 @@ pointaddaffine_avx2:

p256PointAddAffineInline()
// The result is not valid if (sel == 0), so the output is chosen conditionally
MOVL BX, X6 // sel
MOVL CX, X7 // zero

VPXOR Y8, Y8, Y8 // Y8's bits are all 0

VPBROADCASTD X6, Y6
VPBROADCASTD X7, Y7
VPBROADCASTD sel_save, Y6 // sel
VPBROADCASTD zero_save, Y7 // zero

VPCMPEQD Y8, Y6, Y6 // Y6's bits are all 1 if sel = 0, else are 0
VPCMPEQD Y8, Y7, Y7 // Y7's bits are all 1 if zero = 0, else are 0
Expand Down
19 changes: 7 additions & 12 deletions internal/sm2ec/p256_common_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -705,9 +705,8 @@ loop_select:
select_avx2:
VPXOR Y15, Y15, Y15
VPCMPEQD Y14, Y14, Y14
VPSUBD Y14, Y15, Y15
MOVL idx+16(FP), X14 // x14 = idx
VPBROADCASTD X14, Y14
VPSUBD Y14, Y15, Y15 // Y15 = 1
VPBROADCASTD idx+16(FP), Y14

MOVQ limit+24(FP),AX
VMOVDQU Y15, Y13
Expand All @@ -717,9 +716,8 @@ select_avx2:
VPXOR Y2, Y2, Y2

loop_select_avx2:
VMOVDQU Y13, Y12
VPCMPEQD Y14, Y13, Y12
VPADDD Y15, Y13, Y13
VPCMPEQD Y14, Y12, Y12

VPAND (32*0)(DI), Y12, Y3
VPAND (32*1)(DI), Y12, Y4
Expand Down Expand Up @@ -753,7 +751,7 @@ TEXT ·p256SelectAffine(SB),NOSPLIT,$0
PXOR X15, X15 // X15 = 0
PCMPEQL X14, X14 // X14 = -1
PSUBL X14, X15 // X15 = 1
MOVL AX, X14 // x14 = idx
MOVL idx+16(FP), X14 // x14 = idx
PSHUFD $0, X14, X14

MOVQ $16, AX
Expand Down Expand Up @@ -820,25 +818,22 @@ select_base_avx2:
VPXOR Y15, Y15, Y15
VPCMPEQD Y14, Y14, Y14
VPSUBD Y14, Y15, Y15
MOVL AX, X14 // x14 = idx
VPBROADCASTD X14, Y14
VPBROADCASTD idx+16(FP), Y14

MOVQ $16, AX
VMOVDQU Y15, Y13
VPXOR Y0, Y0, Y0
VPXOR Y1, Y1, Y1

loop_select_base_avx2:
VMOVDQU Y13, Y12
VPCMPEQD Y14, Y13, Y12
VPADDD Y15, Y13, Y13
VPCMPEQD Y14, Y12, Y12

VPAND (32*0)(DI), Y12, Y2
VPAND (32*1)(DI), Y12, Y3

VMOVDQU Y13, Y12
VPCMPEQD Y14, Y13, Y12
VPADDD Y15, Y13, Y13
VPCMPEQD Y14, Y12, Y12

VPAND (32*2)(DI), Y12, Y4
VPAND (32*3)(DI), Y12, Y5
Expand Down
14 changes: 4 additions & 10 deletions internal/sm2ec/p256_plugin_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -500,8 +500,6 @@ internalSqrBMI2:
ST (yout) \
\// Load stored values from stack
MOVQ rptr, AX \
MOVL sel_save, BX \
MOVL zero_save, CX \

// func p256PointAddAffineAsm(res, in1 *SM2P256Point, in2 *p256AffinePoint, sign, sel, zero int)
TEXT ·p256PointAddAffineAsm(SB),0,$512-48
Expand Down Expand Up @@ -545,8 +543,8 @@ TEXT ·p256PointAddAffineAsm(SB),0,$512-48
MOVOU zout(16*0), X4
MOVOU zout(16*1), X5

MOVL BX, X6 // sel
MOVL CX, X7 // zero
MOVL sel_save, X6 // sel
MOVL zero_save, X7 // zero

PXOR X8, X8 // X8's bits are all 0
PCMPEQL X9, X9 // X9's bits are all 1
Expand Down Expand Up @@ -643,13 +641,9 @@ pointaddaffine_avx2:

p256PointAddAffineInline()
// The result is not valid if (sel == 0), so the output is chosen conditionally
MOVL BX, X6 // sel
MOVL CX, X7 // zero

VPXOR Y8, Y8, Y8 // Y8's bits are all 0

VPBROADCASTD X6, Y6
VPBROADCASTD X7, Y7
VPBROADCASTD sel_save, Y6 // sel
VPBROADCASTD zero_save, Y7 // zero

VPCMPEQD Y8, Y6, Y6 // Y6's bits are all 1 if sel = 0, else are 0
VPCMPEQD Y8, Y7, Y7 // Y7's bits are all 1 if zero = 0, else are 0
Expand Down

0 comments on commit 48589f0

Please sign in to comment.