Skip to content

Commit

Permalink
x/crypto/chacha20: cleanup chacha_ppc64le.s
Browse files Browse the repository at this point in the history
- Adding PCALIGN before the loops
- Replacing WORD directives with the corresponding Vector Merge Even/Odd Word (VMRGEW/VMRGOW) instructions
- Replacing Branch Conditional (BC) with its extended mnemonic form BDNZ
- Using the VPERMXOR instruction in place of VXOR instructions followed by
  VRLW (rotate left) for rotations that are multiples of 8. This
  replacement gives a performance improvement of around 7%-8% in both time (sec/op) and throughput (B/s), as listed below
  using the benchstat tool.

goos: linux
goarch: ppc64le
pkg: golang.org/x/crypto/chacha20
cpu: POWER10
                 | chacha20.prev.out |       chacha20.new.out            |
                 |      sec/op       |   sec/op     vs base              |
ChaCha20/64              171.9n ± 0%   156.6n ± 1%  -8.90% (p=0.002 n=6)
ChaCha20/256             165.5n ± 0%   152.4n ± 0%  -7.92% (p=0.002 n=6)
ChaCha20/10x25           505.8n ± 0%   504.3n ± 2%  -0.32% (p=0.589 n=6)
ChaCha20/4096            2.265µ ± 0%   2.052µ ± 0%  -9.40% (p=0.002 n=6)
ChaCha20/100x40          5.359µ ± 3%   5.018µ ± 2%  -6.37% (p=0.002 n=6)
ChaCha20/65536           35.71µ ± 0%   32.29µ ± 0%  -9.57% (p=0.002 n=6)
ChaCha20/1000x65         44.63µ ± 0%   41.05µ ± 0%  -8.02% (p=0.002 n=6)
geomean                  2.235µ        2.073µ       -7.26%

                 | chacha20.prev.out |          chacha20.new.out         |
                 |       B/s         |     B/s       vs base             |
ChaCha20/64             355.1Mi ± 0%   389.8Mi ± 1%   +9.78% (p=0.002 n=6)
ChaCha20/256            1.440Gi ± 0%   1.565Gi ± 0%   +8.62% (p=0.002 n=6)
ChaCha20/10x25          471.3Mi ± 0%   472.8Mi ± 2%   +0.31% (p=0.589 n=6)
ChaCha20/4096           1.684Gi ± 0%   1.859Gi ± 0%  +10.38% (p=0.002 n=6)
ChaCha20/100x40         711.8Mi ± 3%   760.3Mi ± 2%   +6.80% (p=0.002 n=6)
ChaCha20/65536          1.709Gi ± 0%   1.890Gi ± 0%  +10.59% (p=0.002 n=6)
ChaCha20/1000x65        1.356Gi ± 0%   1.475Gi ± 0%   +8.72% (p=0.002 n=6)
geomean                 957.3Mi        1.008Gi        +7.83%
Change-Id: Ib31cb10a2a11eacdacf0272fbfd887eb5ccd8bcb
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/564797
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Run-TryBot: Paul Murphy <murp@ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: David Chase <drchase@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
  • Loading branch information
jkrishmys authored and laboger committed Mar 26, 2024
1 parent b91329d commit 8d0d405
Showing 1 changed file with 52 additions and 58 deletions.
110 changes: 52 additions & 58 deletions chacha20/chacha_ppc64le.s
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@
#define CONSTBASE R16
#define BLOCKS R17

// for VPERMXOR
#define MASK R18

DATA consts<>+0x00(SB)/8, $0x3320646e61707865
DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
DATA consts<>+0x10(SB)/8, $0x0000000000000001
Expand All @@ -53,7 +56,11 @@ DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
DATA consts<>+0x88(SB)/8, $0x6b2065746b206574
DATA consts<>+0x90(SB)/8, $0x0000000100000000
DATA consts<>+0x98(SB)/8, $0x0000000300000002
GLOBL consts<>(SB), RODATA, $0xa0
DATA consts<>+0xa0(SB)/8, $0x5566774411223300
DATA consts<>+0xa8(SB)/8, $0xddeeffcc99aabb88
DATA consts<>+0xb0(SB)/8, $0x6677445522330011
DATA consts<>+0xb8(SB)/8, $0xeeffccddaabb8899
GLOBL consts<>(SB), RODATA, $0xc0

//func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
Expand All @@ -70,6 +77,9 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
MOVD $48, R10
MOVD $64, R11
SRD $6, LEN, BLOCKS
// for VPERMXOR
MOVD $consts<>+0xa0(SB), MASK
MOVD $16, R20
// V16
LXVW4X (CONSTBASE)(R0), VS48
ADD $80,CONSTBASE
Expand All @@ -87,6 +97,10 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
// V28
LXVW4X (CONSTBASE)(R11), VS60

// Load mask constants for VPERMXOR
LXVW4X (MASK)(R0), V20
LXVW4X (MASK)(R20), V21

// splat slot from V19 -> V26
VSPLTW $0, V19, V26

Expand All @@ -97,7 +111,7 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40

MOVD $10, R14
MOVD R14, CTR

PCALIGN $16
loop_outer_vsx:
// V0, V1, V2, V3
LXVW4X (R0)(CONSTBASE), VS32
Expand Down Expand Up @@ -128,22 +142,17 @@ loop_outer_vsx:
VSPLTISW $12, V28
VSPLTISW $8, V29
VSPLTISW $7, V30

PCALIGN $16
loop_vsx:
VADDUWM V0, V4, V0
VADDUWM V1, V5, V1
VADDUWM V2, V6, V2
VADDUWM V3, V7, V3

VXOR V12, V0, V12
VXOR V13, V1, V13
VXOR V14, V2, V14
VXOR V15, V3, V15

VRLW V12, V27, V12
VRLW V13, V27, V13
VRLW V14, V27, V14
VRLW V15, V27, V15
VPERMXOR V12, V0, V21, V12
VPERMXOR V13, V1, V21, V13
VPERMXOR V14, V2, V21, V14
VPERMXOR V15, V3, V21, V15

VADDUWM V8, V12, V8
VADDUWM V9, V13, V9
Expand All @@ -165,15 +174,10 @@ loop_vsx:
VADDUWM V2, V6, V2
VADDUWM V3, V7, V3

VXOR V12, V0, V12
VXOR V13, V1, V13
VXOR V14, V2, V14
VXOR V15, V3, V15

VRLW V12, V29, V12
VRLW V13, V29, V13
VRLW V14, V29, V14
VRLW V15, V29, V15
VPERMXOR V12, V0, V20, V12
VPERMXOR V13, V1, V20, V13
VPERMXOR V14, V2, V20, V14
VPERMXOR V15, V3, V20, V15

VADDUWM V8, V12, V8
VADDUWM V9, V13, V9
Expand All @@ -195,15 +199,10 @@ loop_vsx:
VADDUWM V2, V7, V2
VADDUWM V3, V4, V3

VXOR V15, V0, V15
VXOR V12, V1, V12
VXOR V13, V2, V13
VXOR V14, V3, V14

VRLW V15, V27, V15
VRLW V12, V27, V12
VRLW V13, V27, V13
VRLW V14, V27, V14
VPERMXOR V15, V0, V21, V15
VPERMXOR V12, V1, V21, V12
VPERMXOR V13, V2, V21, V13
VPERMXOR V14, V3, V21, V14

VADDUWM V10, V15, V10
VADDUWM V11, V12, V11
Expand All @@ -225,15 +224,10 @@ loop_vsx:
VADDUWM V2, V7, V2
VADDUWM V3, V4, V3

VXOR V15, V0, V15
VXOR V12, V1, V12
VXOR V13, V2, V13
VXOR V14, V3, V14

VRLW V15, V29, V15
VRLW V12, V29, V12
VRLW V13, V29, V13
VRLW V14, V29, V14
VPERMXOR V15, V0, V20, V15
VPERMXOR V12, V1, V20, V12
VPERMXOR V13, V2, V20, V13
VPERMXOR V14, V3, V20, V14

VADDUWM V10, V15, V10
VADDUWM V11, V12, V11
Expand All @@ -249,48 +243,48 @@ loop_vsx:
VRLW V6, V30, V6
VRLW V7, V30, V7
VRLW V4, V30, V4
BC 16, LT, loop_vsx
BDNZ loop_vsx

VADDUWM V12, V26, V12

WORD $0x13600F8C // VMRGEW V0, V1, V27
WORD $0x13821F8C // VMRGEW V2, V3, V28
VMRGEW V0, V1, V27
VMRGEW V2, V3, V28

WORD $0x10000E8C // VMRGOW V0, V1, V0
WORD $0x10421E8C // VMRGOW V2, V3, V2
VMRGOW V0, V1, V0
VMRGOW V2, V3, V2

WORD $0x13A42F8C // VMRGEW V4, V5, V29
WORD $0x13C63F8C // VMRGEW V6, V7, V30
VMRGEW V4, V5, V29
VMRGEW V6, V7, V30

XXPERMDI VS32, VS34, $0, VS33
XXPERMDI VS32, VS34, $3, VS35
XXPERMDI VS59, VS60, $0, VS32
XXPERMDI VS59, VS60, $3, VS34

WORD $0x10842E8C // VMRGOW V4, V5, V4
WORD $0x10C63E8C // VMRGOW V6, V7, V6
VMRGOW V4, V5, V4
VMRGOW V6, V7, V6

WORD $0x13684F8C // VMRGEW V8, V9, V27
WORD $0x138A5F8C // VMRGEW V10, V11, V28
VMRGEW V8, V9, V27
VMRGEW V10, V11, V28

XXPERMDI VS36, VS38, $0, VS37
XXPERMDI VS36, VS38, $3, VS39
XXPERMDI VS61, VS62, $0, VS36
XXPERMDI VS61, VS62, $3, VS38

WORD $0x11084E8C // VMRGOW V8, V9, V8
WORD $0x114A5E8C // VMRGOW V10, V11, V10
VMRGOW V8, V9, V8
VMRGOW V10, V11, V10

WORD $0x13AC6F8C // VMRGEW V12, V13, V29
WORD $0x13CE7F8C // VMRGEW V14, V15, V30
VMRGEW V12, V13, V29
VMRGEW V14, V15, V30

XXPERMDI VS40, VS42, $0, VS41
XXPERMDI VS40, VS42, $3, VS43
XXPERMDI VS59, VS60, $0, VS40
XXPERMDI VS59, VS60, $3, VS42

WORD $0x118C6E8C // VMRGOW V12, V13, V12
WORD $0x11CE7E8C // VMRGOW V14, V15, V14
VMRGOW V12, V13, V12
VMRGOW V14, V15, V14

VSPLTISW $4, V27
VADDUWM V26, V27, V26
Expand Down Expand Up @@ -431,15 +425,15 @@ tail_vsx:
ADD $-1, R11, R12
ADD $-1, INP
ADD $-1, OUT

PCALIGN $16
looptail_vsx:
// Copying the result to OUT
// in bytes.
MOVBZU 1(R12), KEY
MOVBZU 1(INP), TMP
XOR KEY, TMP, KEY
MOVBU KEY, 1(OUT)
BC 16, LT, looptail_vsx
BDNZ looptail_vsx

// Clear the stack values
STXVW4X VS48, (R11)(R0)
Expand Down

0 comments on commit 8d0d405

Please sign in to comment.