Skip to content

Commit 953e809

Browse files
sophie-zhao and abner-chenc
authored and committed
chacha20: add loong64 SIMD implementation
The performance of chacha20 has been greatly improved on 3A6000 and 3A5000. goos: linux goarch: loong64 pkg: golang.org/x/crypto/chacha20 cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | ChaCha20/64 171.9n ± 0% 159.3n ± 0% -7.33% (p=0.000 n=20) ChaCha20/256 592.2n ± 0% 142.8n ± 0% -75.89% (p=0.000 n=20) ChaCha20/10x25 981.5n ± 0% 518.8n ± 0% -47.14% (p=0.000 n=20) ChaCha20/4096 8.991µ ± 0% 1.732µ ± 0% -80.74% (p=0.000 n=20) ChaCha20/100x40 10.651µ ± 0% 5.135µ ± 0% -51.79% (p=0.000 n=20) ChaCha20/65536 143.43µ ± 0% 28.76µ ± 0% -79.95% (p=0.000 n=20) ChaCha20/1000x65 146.17µ ± 0% 37.13µ ± 0% -74.60% (p=0.000 n=20) geomean 5.721µ 1.962µ -65.70% | bench.old | bench.new | | B/s | B/s vs base | ChaCha20/64 355.1Mi ± 0% 383.1Mi ± 0% +7.89% (p=0.000 n=20) ChaCha20/256 412.2Mi ± 0% 1710.2Mi ± 0% +314.86% (p=0.000 n=20) ChaCha20/10x25 242.9Mi ± 0% 459.6Mi ± 0% +89.19% (p=0.000 n=20) ChaCha20/4096 434.5Mi ± 0% 2255.8Mi ± 0% +419.22% (p=0.000 n=20) ChaCha20/100x40 358.1Mi ± 0% 742.9Mi ± 0% +107.44% (p=0.000 n=20) ChaCha20/65536 435.8Mi ± 0% 2173.2Mi ± 0% +398.72% (p=0.000 n=20) ChaCha20/1000x65 424.1Mi ± 0% 1669.4Mi ± 0% +293.64% (p=0.000 n=20) geomean 373.9Mi 1.065Gi +191.55% goos: linux goarch: loong64 pkg: golang.org/x/crypto/chacha20 cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | ChaCha20/64 234.5n ± 0% 295.8n ± 0% +26.14% (p=0.000 n=20) ChaCha20/256 782.0n ± 0% 274.6n ± 0% -64.88% (p=0.000 n=20) ChaCha20/10x25 1340.0n ± 0% 752.7n ± 0% -43.83% (p=0.000 n=20) ChaCha20/4096 11.744µ ± 0% 3.455µ ± 0% -70.58% (p=0.000 n=20) ChaCha20/100x40 14.151µ ± 0% 7.435µ ± 0% -47.46% (p=0.000 n=20) ChaCha20/65536 188.05µ ± 0% 54.33µ ± 0% -71.11% (p=0.000 n=20) ChaCha20/1000x65 191.44µ ± 0% 66.29µ ± 0% -65.37% (p=0.000 n=20) geomean 7.604µ 3.436µ -54.81% | bench.old | bench.new | | B/s | B/s vs base | ChaCha20/64 260.3Mi ± 0% 206.3Mi ± 0% -20.73% (p=0.000 n=20) ChaCha20/256 312.2Mi ± 0% 888.9Mi ± 0% +184.75% 
(p=0.000 n=20) ChaCha20/10x25 177.9Mi ± 0% 316.8Mi ± 0% +78.08% (p=0.000 n=20) ChaCha20/4096 332.6Mi ± 0% 1130.8Mi ± 0% +239.95% (p=0.000 n=20) ChaCha20/100x40 269.6Mi ± 0% 513.1Mi ± 0% +90.34% (p=0.000 n=20) ChaCha20/65536 332.4Mi ± 0% 1150.5Mi ± 0% +246.16% (p=0.000 n=20) ChaCha20/1000x65 323.8Mi ± 0% 935.2Mi ± 0% +188.81% (p=0.000 n=20) geomean 281.3Mi 622.6Mi +121.31% Change-Id: I5386f2029122076c1d22a04610567e3df23877cd Reviewed-on: https://go-review.googlesource.com/c/crypto/+/636257 Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: abner chenc <chenguoqi@loongson.cn> Reviewed-by: Carlos Amedee <carlos@golang.org>
1 parent 18f7707 commit 953e809

File tree

3 files changed

+397
-1
lines changed

3 files changed

+397
-1
lines changed

chacha20/chacha_loong64.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
// Copyright 2025 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build gc && !purego
6+
7+
package chacha20
8+
9+
import "golang.org/x/sys/cpu"
10+
11+
// bufSize is 256 bytes on loong64. This equals the amount of data the
// xorKeyStreamVX assembly consumes per loop iteration (four 64-byte ChaCha20
// blocks computed side by side in vector lanes).
// NOTE(review): the consumers of bufSize live outside this file; presumably the
// shared cipher code uses it to size the chunks passed to xorKeyStreamBlocks —
// confirm against chacha_generic.go.
const bufSize = 256
12+
13+
// xorKeyStreamVX is implemented in chacha_loong64.s. It XORs src with the
// ChaCha20 key stream derived from key, nonce and *counter, and writes the
// result to dst.
//
// The assembly works in 256-byte steps and loops until the remaining length is
// exactly zero; it reads only len(src) and never len(dst). Consequently
// len(src) must be a non-zero multiple of 256 and dst must be at least as long
// as src. After every 256 bytes, *counter is advanced by 4 and stored back.
//
// The routine uses vector (V register) instructions; the only caller visible
// here guards the call with cpu.Loong64.HasLSX.
//go:noescape
14+
func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
15+
16+
// xorKeyStreamBlocks XORs src with the key stream and writes the result to
// dst, choosing the implementation at run time: the vector assembly routine
// xorKeyStreamVX when the CPU reports LSX support, otherwise the portable
// xorKeyStreamBlocksGeneric.
//
// The assembly path processes 256 bytes per iteration and only terminates when
// the remaining length reaches exactly zero, so src must be a non-empty
// multiple of bufSize (256) bytes and dst must be at least as long.
// NOTE(review): the callers are outside this view — confirm they never pass an
// empty or non-multiple-of-256 slice.
func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) {
17+
if cpu.Loong64.HasLSX {
18+
// The assembly updates c.counter in place through the pointer.
xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter)
19+
} else {
20+
c.xorKeyStreamBlocksGeneric(dst, src)
21+
}
22+
}

chacha20/chacha_loong64.s

Lines changed: 374 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,374 @@
1+
// Copyright 2025 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
// derived from chacha_arm64.s
6+
7+
//go:build gc && !purego
8+
9+
#include "textflag.h"
10+
11+
// ·constants holds the four 32-bit ChaCha20 constant words (state words 0..3).
// Read as little-endian bytes they spell the ASCII string "expand 32-byte k":
// 0x61707865 = "expa", 0x3320646e = "nd 3", 0x79622d32 = "2-by",
// 0x6b206574 = "te k".
DATA ·constants+0x00(SB)/4, $0x61707865
12+
DATA ·constants+0x04(SB)/4, $0x3320646e
13+
DATA ·constants+0x08(SB)/4, $0x79622d32
14+
DATA ·constants+0x0c(SB)/4, $0x6b206574
15+
// The symbol is declared 32 bytes long although only the first 16 bytes are
// given explicit values above.
GLOBL ·constants(SB), NOPTR|RODATA, $32
16+
17+
// ·incRotMatrix holds the per-lane block offsets 0, 1, 2, 3. xorKeyStreamVX
// loads them as one vector and adds them to the replicated block counter so
// that each of the four lanes works on a different consecutive block.
DATA ·incRotMatrix+0x00(SB)/4, $0x00000000
18+
DATA ·incRotMatrix+0x04(SB)/4, $0x00000001
19+
DATA ·incRotMatrix+0x08(SB)/4, $0x00000002
20+
DATA ·incRotMatrix+0x0c(SB)/4, $0x00000003
21+
// As with ·constants, the symbol is declared 32 bytes long but only the first
// 16 bytes are given explicit values here.
GLOBL ·incRotMatrix(SB), NOPTR|RODATA, $32
22+
23+
// NUM_ROUNDS is the iteration count of the "chacha" loop in xorKeyStreamVX.
// Each iteration performs one column round followed by one diagonal round, so
// 10 iterations give the 20 rounds of ChaCha20.
#define NUM_ROUNDS 10
24+
25+
// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
//
// xorKeyStreamVX XORs src with the ChaCha20 key stream and stores the result
// in dst, 256 bytes (four 64-byte blocks) per iteration of "loop".
//
// Register usage:
//   R4  dst pointer (advanced by 256 per iteration)
//   R5  src pointer (advanced by 256 per iteration)
//   R6  remaining src length in bytes; dst_len is never read
//   R7  key pointer      R8  nonce pointer      R9  counter pointer
//   R10 address of ·constants     R11 address of ·incRotMatrix
//   R12 scalar copy of the block counter, written back to (R9) each iteration
//   R15 round-loop counter (NUM_ROUNDS down to 0)
//
// Data layout: V0..V15 each hold ONE ChaCha20 state word for FOUR blocks (one
// block per 32-bit lane). The state words are loaded with replicated word
// loads, and the lanes are made distinct by adding the offsets 0..3 from
// ·incRotMatrix to the counter word (V12).
//
// The WORD directives are hand-encoded instructions; the mnemonic each one
// stands for is given in the comment directly above it (presumably because the
// assembler did not accept these mnemonics when this was written — confirm).
//
// The outer loop exits only when R6 reaches exactly zero, so src_len must be a
// non-zero multiple of 256.
26+
TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
27+
MOVV dst+0(FP), R4
28+
MOVV src+24(FP), R5
29+
MOVV src_len+32(FP), R6
30+
MOVV key+48(FP), R7
31+
MOVV nonce+56(FP), R8
32+
MOVV counter+64(FP), R9
33+
34+
MOVV $·constants(SB), R10
35+
MOVV $·incRotMatrix(SB), R11
36+
37+
// R12 = current 32-bit block counter
MOVW (R9), R12
38+
39+
loop:
40+
MOVV $NUM_ROUNDS, R15
41+
// load the four 32-bit lane offsets (0, 1, 2, 3) from incRotMatrix; they are
// added to the counter word below
42+
VMOVQ (R11), V30
43+
44+
// load constants (each word replicated into all four lanes)
45+
// VLDREPL.W $0, R10, V0
46+
WORD $0x30200140
47+
// VLDREPL.W $1, R10, V1
48+
WORD $0x30200541
49+
// VLDREPL.W $2, R10, V2
50+
WORD $0x30200942
51+
// VLDREPL.W $3, R10, V3
52+
WORD $0x30200d43
53+
54+
// load keys
55+
// VLDREPL.W $0, R7, V4
56+
WORD $0x302000e4
57+
// VLDREPL.W $1, R7, V5
58+
WORD $0x302004e5
59+
// VLDREPL.W $2, R7, V6
60+
WORD $0x302008e6
61+
// VLDREPL.W $3, R7, V7
62+
WORD $0x30200ce7
63+
// VLDREPL.W $4, R7, V8
64+
WORD $0x302010e8
65+
// VLDREPL.W $5, R7, V9
66+
WORD $0x302014e9
67+
// VLDREPL.W $6, R7, V10
68+
WORD $0x302018ea
69+
// VLDREPL.W $7, R7, V11
70+
WORD $0x30201ceb
71+
72+
// load counter + nonce
73+
// VLDREPL.W $0, R9, V12
74+
WORD $0x3020012c
75+
76+
// VLDREPL.W $0, R8, V13
77+
WORD $0x3020010d
78+
// VLDREPL.W $1, R8, V14
79+
WORD $0x3020050e
80+
// VLDREPL.W $2, R8, V15
81+
WORD $0x3020090f
82+
83+
// give each lane its own block counter: V12 = counter + {0, 1, 2, 3}
84+
VADDW V30, V12, V12
85+
86+
chacha:
// Column round. VROTRW rotates right, so the left-rotations by 16, 12, 8 and 7
// named in the comments appear as right-rotations by 16, 20, 24 and 25.
87+
// V0..V3 += V4..V7
88+
// V12..V15 <<<= ((V12..V15 XOR V0..V3), 16)
89+
VADDW V0, V4, V0
90+
VADDW V1, V5, V1
91+
VADDW V2, V6, V2
92+
VADDW V3, V7, V3
93+
VXORV V12, V0, V12
94+
VXORV V13, V1, V13
95+
VXORV V14, V2, V14
96+
VXORV V15, V3, V15
97+
VROTRW $16, V12, V12
98+
VROTRW $16, V13, V13
99+
VROTRW $16, V14, V14
100+
VROTRW $16, V15, V15
101+
102+
// V8..V11 += V12..V15
103+
// V4..V7 <<<= ((V4..V7 XOR V8..V11), 12)
104+
VADDW V8, V12, V8
105+
VADDW V9, V13, V9
106+
VADDW V10, V14, V10
107+
VADDW V11, V15, V11
108+
VXORV V4, V8, V4
109+
VXORV V5, V9, V5
110+
VXORV V6, V10, V6
111+
VXORV V7, V11, V7
112+
VROTRW $20, V4, V4
113+
VROTRW $20, V5, V5
114+
VROTRW $20, V6, V6
115+
VROTRW $20, V7, V7
116+
117+
// V0..V3 += V4..V7
118+
// V12..V15 <<<= ((V12..V15 XOR V0..V3), 8)
119+
VADDW V0, V4, V0
120+
VADDW V1, V5, V1
121+
VADDW V2, V6, V2
122+
VADDW V3, V7, V3
123+
VXORV V12, V0, V12
124+
VXORV V13, V1, V13
125+
VXORV V14, V2, V14
126+
VXORV V15, V3, V15
127+
VROTRW $24, V12, V12
128+
VROTRW $24, V13, V13
129+
VROTRW $24, V14, V14
130+
VROTRW $24, V15, V15
131+
132+
// V8..V11 += V12..V15
133+
// V4..V7 <<<= ((V4..V7 XOR V8..V11), 7)
134+
VADDW V12, V8, V8
135+
VADDW V13, V9, V9
136+
VADDW V14, V10, V10
137+
VADDW V15, V11, V11
138+
VXORV V4, V8, V4
139+
VXORV V5, V9, V5
140+
VXORV V6, V10, V6
141+
VXORV V7, V11, V7
142+
VROTRW $25, V4, V4
143+
VROTRW $25, V5, V5
144+
VROTRW $25, V6, V6
145+
VROTRW $25, V7, V7
146+
// Diagonal round: the same four steps, with the register pairing shifted so
// that the quarter rounds run along the diagonals of the 4x4 state.
147+
// V0..V3 += V5..V7, V4
148+
// V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 16)
149+
VADDW V0, V5, V0
150+
VADDW V1, V6, V1
151+
VADDW V2, V7, V2
152+
VADDW V3, V4, V3
153+
VXORV V15, V0, V15
154+
VXORV V12, V1, V12
155+
VXORV V13, V2, V13
156+
VXORV V14, V3, V14
157+
VROTRW $16, V15, V15
158+
VROTRW $16, V12, V12
159+
VROTRW $16, V13, V13
160+
VROTRW $16, V14, V14
161+
162+
// V10,V11,V8,V9 += V15,V12,V13,V14
163+
// V5,V6,V7,V4 <<<= ((V5,V6,V7,V4 XOR V10,V11,V8,V9), 12)
164+
VADDW V10, V15, V10
165+
VADDW V11, V12, V11
166+
VADDW V8, V13, V8
167+
VADDW V9, V14, V9
168+
VXORV V5, V10, V5
169+
VXORV V6, V11, V6
170+
VXORV V7, V8, V7
171+
VXORV V4, V9, V4
172+
VROTRW $20, V5, V5
173+
VROTRW $20, V6, V6
174+
VROTRW $20, V7, V7
175+
VROTRW $20, V4, V4
176+
177+
// V0..V3 += V5..V7, V4
178+
// V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 8)
179+
VADDW V5, V0, V0
180+
VADDW V6, V1, V1
181+
VADDW V7, V2, V2
182+
VADDW V4, V3, V3
183+
VXORV V15, V0, V15
184+
VXORV V12, V1, V12
185+
VXORV V13, V2, V13
186+
VXORV V14, V3, V14
187+
VROTRW $24, V15, V15
188+
VROTRW $24, V12, V12
189+
VROTRW $24, V13, V13
190+
VROTRW $24, V14, V14
191+
192+
// V10,V11,V8,V9 += V15,V12,V13,V14
193+
// V5,V6,V7,V4 <<<= ((V5,V6,V7,V4 XOR V10,V11,V8,V9), 7)
194+
VADDW V15, V10, V10
195+
VADDW V12, V11, V11
196+
VADDW V13, V8, V8
197+
VADDW V14, V9, V9
198+
VXORV V5, V10, V5
199+
VXORV V6, V11, V6
200+
VXORV V7, V8, V7
201+
VXORV V4, V9, V4
202+
VROTRW $25, V5, V5
203+
VROTRW $25, V6, V6
204+
VROTRW $25, V7, V7
205+
VROTRW $25, V4, V4
206+
207+
// one double round done; repeat until R15 reaches zero
SUBV $1, R15
208+
BNE R15, R0, chacha
209+
210+
// reload the original constants (the working copies in V0..V3 were overwritten
// by the rounds)
211+
// VLDREPL.W $0, R10, V16
212+
WORD $0x30200150
213+
// VLDREPL.W $1, R10, V17
214+
WORD $0x30200551
215+
// VLDREPL.W $2, R10, V18
216+
WORD $0x30200952
217+
// VLDREPL.W $3, R10, V19
218+
WORD $0x30200d53
219+
220+
// reload the original keys
221+
// VLDREPL.W $0, R7, V20
222+
WORD $0x302000f4
223+
// VLDREPL.W $1, R7, V21
224+
WORD $0x302004f5
225+
// VLDREPL.W $2, R7, V22
226+
WORD $0x302008f6
227+
// VLDREPL.W $3, R7, V23
228+
WORD $0x30200cf7
229+
// VLDREPL.W $4, R7, V24
230+
WORD $0x302010f8
231+
// VLDREPL.W $5, R7, V25
232+
WORD $0x302014f9
233+
// VLDREPL.W $6, R7, V26
234+
WORD $0x302018fa
235+
// VLDREPL.W $7, R7, V27
236+
WORD $0x30201cfb
237+
238+
// add back the initial state to generate the key stream
239+
VADDW V30, V12, V12 // add the lane offsets 0..3 to word 12 now, because V30 is reloaded with nonce[1] below; the replicated counter (V28) is added later
240+
VADDW V16, V0, V0
241+
VADDW V17, V1, V1
242+
VADDW V18, V2, V2
243+
VADDW V19, V3, V3
244+
245+
// reload the original counter + nonce
246+
// VLDREPL.W $0, R9, V28
247+
WORD $0x3020013c
248+
// VLDREPL.W $0, R8, V29
249+
WORD $0x3020011d
250+
// VLDREPL.W $1, R8, V30
251+
WORD $0x3020051e
252+
// VLDREPL.W $2, R8, V31
253+
WORD $0x3020091f
254+
255+
VADDW V20, V4, V4
256+
VADDW V21, V5, V5
257+
VADDW V22, V6, V6
258+
VADDW V23, V7, V7
259+
VADDW V24, V8, V8
260+
VADDW V25, V9, V9
261+
VADDW V26, V10, V10
262+
VADDW V27, V11, V11
263+
VADDW V28, V12, V12
264+
VADDW V29, V13, V13
265+
VADDW V30, V14, V14
266+
VADDW V31, V15, V15
267+
268+
// shuffle: interleave 32-bit words (VILV?W), then 64-bit halves (VILV?V), to
// regroup the word-per-register layout into vectors that are XORed with
// consecutive 16-byte chunks of src below. The second interleave stage is
// interspersed with the src loads.
269+
VILVLW V0, V1, V16
270+
VILVHW V0, V1, V17
271+
VILVLW V2, V3, V18
272+
VILVHW V2, V3, V19
273+
VILVLW V4, V5 ,V20
274+
VILVHW V4, V5, V21
275+
VILVLW V6, V7, V22
276+
VILVHW V6, V7, V23
277+
VILVLW V8, V9, V24
278+
VILVHW V8, V9, V25
279+
VILVLW V10, V11, V26
280+
VILVHW V10, V11, V27
281+
VILVLW V12, V13, V28
282+
VILVHW V12, V13, V29
283+
VILVLW V14, V15, V30
284+
VILVHW V14, V15, V31
285+
VILVLV V16, V18, V0
286+
VILVHV V16, V18, V4
287+
VILVLV V17, V19, V8
288+
VILVHV V17, V19, V12
289+
290+
// load src data from R5 (V16..V19 are free again after the interleave above)
291+
VMOVQ 0(R5), V16
292+
VMOVQ 16(R5), V17
293+
VMOVQ 32(R5), V18
294+
VMOVQ 48(R5), V19
295+
296+
VILVLV V20, V22, V1
297+
VILVHV V20, V22, V5
298+
VILVLV V21, V23, V9
299+
VILVHV V21, V23, V13
300+
301+
VMOVQ 64(R5), V20
302+
VMOVQ 80(R5), V21
303+
VMOVQ 96(R5), V22
304+
VMOVQ 112(R5), V23
305+
306+
VILVLV V24, V26, V2
307+
VILVHV V24, V26, V6
308+
VILVLV V25, V27, V10
309+
VILVHV V25, V27, V14
310+
311+
VMOVQ 128(R5), V24
312+
VMOVQ 144(R5), V25
313+
VMOVQ 160(R5), V26
314+
VMOVQ 176(R5), V27
315+
316+
VILVLV V28, V30, V3
317+
VILVHV V28, V30, V7
318+
VILVLV V29, V31, V11
319+
VILVHV V29, V31, V15
320+
321+
VMOVQ 192(R5), V28
322+
VMOVQ 208(R5), V29
323+
VMOVQ 224(R5), V30
324+
VMOVQ 240(R5), V31
325+
326+
// XOR the key stream (V0..V15) into the loaded src data and store to dst,
// 64 bytes at a time
VXORV V0, V16, V16
327+
VXORV V1, V17, V17
328+
VXORV V2, V18, V18
329+
VXORV V3, V19, V19
330+
331+
VMOVQ V16, 0(R4)
332+
VMOVQ V17, 16(R4)
333+
VMOVQ V18, 32(R4)
334+
VMOVQ V19, 48(R4)
335+
336+
VXORV V4, V20, V20
337+
VXORV V5, V21, V21
338+
VXORV V6, V22, V22
339+
VXORV V7, V23, V23
340+
341+
VMOVQ V20, 64(R4)
342+
VMOVQ V21, 80(R4)
343+
VMOVQ V22, 96(R4)
344+
VMOVQ V23, 112(R4)
345+
346+
VXORV V8, V24, V24
347+
VXORV V9, V25, V25
348+
VXORV V10, V26, V26
349+
VXORV V11, V27, V27
350+
351+
VMOVQ V24, 128(R4)
352+
VMOVQ V25, 144(R4)
353+
VMOVQ V26, 160(R4)
354+
VMOVQ V27, 176(R4)
355+
356+
VXORV V12, V28, V28
357+
VXORV V13, V29, V29
358+
VXORV V14, V30, V30
359+
VXORV V15, V31, V31
360+
361+
VMOVQ V28, 192(R4)
362+
VMOVQ V29, 208(R4)
363+
VMOVQ V30, 224(R4)
364+
VMOVQ V31, 240(R4)
365+
366+
// four blocks were produced, so advance the block counter by 4 and store it
// back; the next iteration reloads it from (R9)
ADD $4, R12, R12
367+
MOVW R12, (R9) // update counter
368+
369+
// advance dst/src by 256 bytes and loop until no input remains; the test is
// "not equal to zero", so the length must be an exact multiple of 256
ADDV $256, R4, R4
370+
ADDV $256, R5, R5
371+
SUBV $256, R6, R6
372+
BNE R6, R0, loop
373+
374+
RET

0 commit comments

Comments
 (0)