Skip to content

Commit 388684e

Browse files
sophie-zhao authored and abner-chenc committed
argon2: add loong64 SIMD implementation
The performance gains on Loongson 3A6000 and 3A5000 are as follows: goos: linux goarch: loong64 pkg: golang.org/x/crypto/argon2 cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Argon2i/_Time:_3_Memory:_32_MB,_Threads:_1 131.23m ± 0% 67.56m ± 1% -48.52% (p=0.000 n=10) Argon2i/_Time:_4_Memory:_32_MB,_Threads:_1 171.28m ± 2% 90.20m ± 0% -47.34% (p=0.000 n=10) Argon2i/_Time:_5_Memory:_32_MB,_Threads:_1 213.3m ± 0% 112.6m ± 0% -47.21% (p=0.000 n=10) Argon2i/_Time:_3_Memory:_64_MB,_Threads:_4 269.5m ± 0% 147.2m ± 0% -45.37% (p=0.000 n=10) Argon2i/_Time:_4_Memory:_64_MB,_Threads:_4 357.7m ± 0% 195.4m ± 0% -45.36% (p=0.000 n=10) Argon2i/_Time:_5_Memory:_64_MB,_Threads:_4 449.8m ± 0% 243.8m ± 0% -45.79% (p=0.000 n=10) Argon2d/_Time:_3,_Memory:_32_MB,_Threads:_1 126.56m ± 0% 67.43m ± 0% -46.72% (p=0.000 n=10) Argon2d/_Time:_4,_Memory:_32_MB,_Threads:_1 168.57m ± 0% 90.04m ± 0% -46.58% (p=0.000 n=10) Argon2d/_Time:_5,_Memory:_32_MB,_Threads:_1 210.5m ± 0% 112.7m ± 0% -46.45% (p=0.000 n=10) Argon2d/_Time:_3,_Memory:_64_MB,_Threads:_4 264.8m ± 0% 145.0m ± 1% -45.23% (p=0.000 n=10) Argon2d/_Time:_4,_Memory:_64_MB,_Threads:_4 353.8m ± 0% 193.7m ± 0% -45.26% (p=0.000 n=10) Argon2d/_Time:_5,_Memory:_64_MB,_Threads:_4 444.4m ± 0% 242.3m ± 0% -45.49% (p=0.000 n=10) Argon2id/_Time:_3,_Memory:_32_MB,_Threads:_1 126.89m ± 0% 66.62m ± 0% -47.50% (p=0.000 n=10) Argon2id/_Time:_4,_Memory:_32_MB,_Threads:_1 169.02m ± 0% 89.07m ± 0% -47.30% (p=0.000 n=10) Argon2id/_Time:_5,_Memory:_32_MB,_Threads:_1 210.7m ± 0% 111.0m ± 0% -47.34% (p=0.000 n=10) Argon2id/_Time:_3,_Memory:_64_MB,_Threads:_4 267.6m ± 1% 145.8m ± 0% -45.51% (p=0.000 n=10) Argon2id/_Time:_4,_Memory:_64_MB,_Threads:_4 355.1m ± 0% 194.1m ± 0% -45.34% (p=0.000 n=10) Argon2id/_Time:_5,_Memory:_64_MB,_Threads:_4 443.6m ± 0% 242.5m ± 0% -45.33% (p=0.000 n=10) geomean 240.8m 129.3m -46.32% goos: linux goarch: loong64 pkg: golang.org/x/crypto/argon2 cpu: Loongson-3A5000 @ 2500.00MHz | bench.old 
| bench.new | | sec/op | sec/op vs base | Argon2i/_Time:_3_Memory:_32_MB,_Threads:_1 209.9m ± 1% 109.7m ± 2% -47.75% (p=0.000 n=10) Argon2i/_Time:_4_Memory:_32_MB,_Threads:_1 278.1m ± 0% 143.7m ± 0% -48.34% (p=0.000 n=10) Argon2i/_Time:_5_Memory:_32_MB,_Threads:_1 346.7m ± 0% 178.1m ± 0% -48.63% (p=0.000 n=10) Argon2i/_Time:_3_Memory:_64_MB,_Threads:_4 455.3m ± 0% 240.8m ± 0% -47.12% (p=0.000 n=10) Argon2i/_Time:_4_Memory:_64_MB,_Threads:_4 604.6m ± 0% 317.7m ± 0% -47.45% (p=0.000 n=10) Argon2i/_Time:_5_Memory:_64_MB,_Threads:_4 754.8m ± 0% 395.4m ± 0% -47.61% (p=0.000 n=10) Argon2d/_Time:_3,_Memory:_32_MB,_Threads:_1 206.9m ± 1% 107.6m ± 0% -48.00% (p=0.000 n=10) Argon2d/_Time:_4,_Memory:_32_MB,_Threads:_1 274.3m ± 0% 141.8m ± 1% -48.32% (p=0.000 n=10) Argon2d/_Time:_5,_Memory:_32_MB,_Threads:_1 342.4m ± 0% 175.6m ± 0% -48.71% (p=0.000 n=10) Argon2d/_Time:_3,_Memory:_64_MB,_Threads:_4 450.2m ± 0% 237.9m ± 0% -47.15% (p=0.000 n=10) Argon2d/_Time:_4,_Memory:_64_MB,_Threads:_4 597.7m ± 0% 314.0m ± 0% -47.46% (p=0.000 n=10) Argon2d/_Time:_5,_Memory:_64_MB,_Threads:_4 745.8m ± 0% 390.7m ± 1% -47.61% (p=0.000 n=10) Argon2id/_Time:_3,_Memory:_32_MB,_Threads:_1 207.6m ± 0% 107.9m ± 0% -48.05% (p=0.000 n=10) Argon2id/_Time:_4,_Memory:_32_MB,_Threads:_1 275.0m ± 0% 142.0m ± 0% -48.34% (p=0.000 n=10) Argon2id/_Time:_5,_Memory:_32_MB,_Threads:_1 342.9m ± 1% 176.0m ± 0% -48.66% (p=0.000 n=10) Argon2id/_Time:_3,_Memory:_64_MB,_Threads:_4 450.6m ± 1% 238.5m ± 0% -47.07% (p=0.000 n=10) Argon2id/_Time:_4,_Memory:_64_MB,_Threads:_4 598.5m ± 1% 314.6m ± 0% -47.44% (p=0.000 n=10) Argon2id/_Time:_5,_Memory:_64_MB,_Threads:_4 746.4m ± 0% 391.0m ± 0% -47.61% (p=0.000 n=10) geomean 398.6m 207.9m -47.86% Change-Id: Iaa9d134d68dd2f0972fc5768d7e66f7b1ff0ebd3 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/657795 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: abner chenc <chenguoqi@loongson.cn> Reviewed-by: Dmitri Shuralyov 
<dmitshur@google.com> Reviewed-by: Carlos Amedee <carlos@golang.org>
1 parent 953e809 commit 388684e

File tree

3 files changed

+318
-1
lines changed

3 files changed

+318
-1
lines changed

argon2/blamka_loong64.go

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
// Copyright 2025 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build loong64 && gc && !purego
6+
7+
package argon2
8+
9+
import "golang.org/x/sys/cpu"
10+
11+
// mixBlocks1VX is implemented in assembly (blamka_loong64.s). It stores the
// element-wise XOR of in1 and in2 into out. The assembly is written with
// vector (V-register) instructions.
//
//go:noescape
12+
func mixBlocks1VX(out, in1, in2 *block)
13+
14+
// mixBlocks2VX is implemented in assembly (blamka_loong64.s). It overwrites
// out with the element-wise XOR of in1, in2 and t.
//
//go:noescape
15+
func mixBlocks2VX(out, in1, in2, t *block)
16+
17+
// xorBlocksVX is implemented in assembly (blamka_loong64.s). It XORs the
// element-wise XOR of in1, in2 and t into the existing contents of out.
//
//go:noescape
18+
func xorBlocksVX(out, in1, in2, t *block)
19+
20+
// blamkaVX is implemented in assembly (blamka_loong64.s). It applies the
// vectorized BlaMka rounds to b in place: eight passes over consecutive
// 128-byte groups followed by eight passes over strided groups.
//
//go:noescape
21+
func blamkaVX(b *block)
22+
23+
func processBlockVX(out, in1, in2 *block, xor bool) {
24+
var t block
25+
mixBlocks1VX(&t, in1, in2)
26+
if cpu.Loong64.HasLSX {
27+
blamkaVX(&t)
28+
} else {
29+
for i := 0; i < blockLength; i += 16 {
30+
blamkaGeneric(
31+
&t[i+0], &t[i+1], &t[i+2], &t[i+3],
32+
&t[i+4], &t[i+5], &t[i+6], &t[i+7],
33+
&t[i+8], &t[i+9], &t[i+10], &t[i+11],
34+
&t[i+12], &t[i+13], &t[i+14], &t[i+15],
35+
)
36+
}
37+
for i := 0; i < blockLength/8; i += 2 {
38+
blamkaGeneric(
39+
&t[i], &t[i+1], &t[16+i], &t[16+i+1],
40+
&t[32+i], &t[32+i+1], &t[48+i], &t[48+i+1],
41+
&t[64+i], &t[64+i+1], &t[80+i], &t[80+i+1],
42+
&t[96+i], &t[96+i+1], &t[112+i], &t[112+i+1],
43+
)
44+
}
45+
}
46+
if xor {
47+
xorBlocksVX(out, in1, in2, &t)
48+
} else {
49+
mixBlocks2VX(out, in1, in2, &t)
50+
}
51+
}
52+
53+
func processBlock(out, in1, in2 *block) {
54+
processBlockVX(out, in1, in2, false)
55+
}
56+
57+
func processBlockXOR(out, in1, in2 *block) {
58+
processBlockVX(out, in1, in2, true)
59+
}

argon2/blamka_loong64.s

Lines changed: 258 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,258 @@
1+
// Copyright 2025 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build loong64 && gc && !purego
6+
7+
#include "textflag.h"
8+
9+
// BLAMKA_ROUND permutes the eight vectors V0-V7 in place, using V8-V11 as
// scratch registers.
//
// The macro is built from one repeated step on a register triple (a, b, d):
//   p = VMULWEVVWU(a, b)
//   a = a + b + p + p
//   d = rotate-right(d XOR a, n)
// applied alternately to (V0, V2, V6) and (V4, V6, V2) with n = 32, 24, 16,
// 63, and then likewise to (V1, V3, V7) and (V5, V7, V3).
// NOTE(review): VMULWEVVWU presumably multiplies the low (even) unsigned
// 32-bit half of every 64-bit lane, giving a 64-bit product - confirm against
// the Go loong64 assembler reference.
//
// After the first half, V4/V5 are swapped and V2, V3, V6, V7 are recombined
// with VSHUF4IV ($9 / $3). The same step sequence runs again, and the final
// shuffle uses the immediates the other way round ($3 / $9).
//
// The first half uses V8 for the product. V8 is then zeroed and must stay
// zero until the end of the macro, because "VADDV Vx, V8, Vy" is used as a
// register copy; the second half therefore uses V9 for the product.
#define BLAMKA_ROUND \
10+
VMULWEVVWU V0, V2, V8; \
11+
VADDV V2, V0, V0; \
12+
VADDV V0, V8, V0; \
13+
VADDV V0, V8, V0; \
14+
VXORV V6, V0, V6; \
15+
VROTRV $32, V6, V6; \
16+
VMULWEVVWU V4, V6, V8; \
17+
VADDV V4, V6, V4; \
18+
VADDV V4, V8, V4; \
19+
VADDV V4, V8, V4; \
20+
VXORV V2, V4, V2; \
21+
VROTRV $24, V2, V2; \
22+
VMULWEVVWU V0, V2, V8; \
23+
VADDV V0, V2, V0; \
24+
VADDV V0, V8, V0; \
25+
VADDV V0, V8, V0; \
26+
VXORV V6, V0, V6; \
27+
VROTRV $16, V6, V6; \
28+
VMULWEVVWU V4, V6, V8; \
29+
VADDV V4, V6, V4; \
30+
VADDV V4, V8, V4; \
31+
VADDV V4, V8, V4; \
32+
VXORV V2, V4, V2; \
33+
VROTRV $63, V2, V2; \
34+
;\
35+
VMULWEVVWU V1, V3, V8; \
36+
VADDV V1, V3, V1; \
37+
VADDV V1, V8, V1; \
38+
VADDV V1, V8, V1; \
39+
VXORV V7, V1, V7; \
40+
VROTRV $32, V7, V7; \
41+
VMULWEVVWU V5, V7, V8; \
42+
VADDV V5, V7, V5; \
43+
VADDV V5, V8, V5; \
44+
VADDV V5, V8, V5; \
45+
VXORV V3, V5, V3; \
46+
VROTRV $24, V3, V3; \
47+
VMULWEVVWU V1, V3, V8; \
48+
VADDV V1, V3, V1; \
49+
VADDV V1, V8, V1; \
50+
VADDV V1, V8, V1; \
51+
VXORV V7, V1, V7; \
52+
VROTRV $16, V7, V7; \
53+
VMULWEVVWU V5, V7, V8; \
54+
VADDV V5, V7, V5; \
55+
VADDV V5, V8, V5; \
56+
VADDV V5, V8, V5; \
57+
VXORV V3, V5, V3; \
58+
VROTRV $63, V3, V3; \
59+
;\
60+
VXORV V0, V0, V8; \ // V8 = 0 (kept zero so VADDV with V8 acts as a copy)
61+
VADDV V2, V8, V9; \ // V9 = V2
62+
VADDV V5, V8, V10; \ // V10 = V5
63+
VADDV V6, V8, V11; \ // V11 = V6
64+
VADDV V4, V8, V5; \ // V5 = V4
65+
VADDV V10, V8, V4; \ // V4 = old V5 (saved in V10)
66+
VSHUF4IV $9, V3, V2; \
67+
VSHUF4IV $9, V9, V3; \
68+
VSHUF4IV $3, V7, V6; \
69+
VSHUF4IV $3, V11, V7; \
70+
;\
71+
VMULWEVVWU V0, V2, V9; \
72+
VADDV V0, V2, V0; \
73+
VADDV V0, V9, V0; \
74+
VADDV V0, V9, V0; \
75+
VXORV V6, V0, V6; \
76+
VROTRV $32, V6, V6; \
77+
VMULWEVVWU V4, V6, V9; \
78+
VADDV V4, V6, V4; \
79+
VADDV V4, V9, V4; \
80+
VADDV V4, V9, V4; \
81+
VXORV V2, V4, V2; \
82+
VROTRV $24, V2, V2; \
83+
VMULWEVVWU V0, V2, V9; \
84+
VADDV V0, V2, V0; \
85+
VADDV V0, V9, V0; \
86+
VADDV V0, V9, V0; \
87+
VXORV V6, V0, V6; \
88+
VROTRV $16, V6, V6; \
89+
VMULWEVVWU V4, V6, V9; \
90+
VADDV V4, V6, V4; \
91+
VADDV V4, V9, V4; \
92+
VADDV V4, V9, V4; \
93+
VXORV V2, V4, V2; \
94+
VROTRV $63, V2, V2; \
95+
;\
96+
VMULWEVVWU V1, V3, V9; \
97+
VADDV V1, V3, V1; \
98+
VADDV V1, V9, V1; \
99+
VADDV V1, V9, V1; \
100+
VXORV V7, V1, V7; \
101+
VROTRV $32, V7, V7; \
102+
VMULWEVVWU V5, V7, V9; \
103+
VADDV V5, V7, V5; \
104+
VADDV V5, V9, V5; \
105+
VADDV V5, V9, V5; \
106+
VXORV V3, V5, V3; \
107+
VROTRV $24, V3, V3; \
108+
VMULWEVVWU V1, V3, V9; \
109+
VADDV V1, V3, V1; \
110+
VADDV V1, V9, V1; \
111+
VADDV V1, V9, V1; \
112+
VXORV V7, V1, V7; \
113+
VROTRV $16, V7, V7; \
114+
VMULWEVVWU V5, V7, V9; \
115+
VADDV V5, V7, V5; \
116+
VADDV V5, V9, V5; \
117+
VADDV V5, V9, V5; \
118+
VXORV V3, V5, V3; \
119+
VROTRV $63, V3, V3; \
120+
;\
121+
VADDV V2, V8, V9; \ // V9 = V2
122+
VADDV V5, V8, V10; \ // V10 = V5
123+
VADDV V6, V8, V11; \ // V11 = V6
124+
VADDV V4, V8, V5; \ // V5 = V4
125+
VADDV V10, V8, V4; \ // V4 = old V5 (saved in V10)
126+
VSHUF4IV $3, V3, V2; \
127+
VSHUF4IV $3, V9, V3; \
128+
VSHUF4IV $9, V7, V6; \
129+
VSHUF4IV $9, V11, V7; \
130+
131+
// BLAMKA_ROUND1(index) loads V0-V7 from eight consecutive 16-byte slots
// starting at byte offset index from R4, runs BLAMKA_ROUND on them, and
// stores V0-V7 back to the same slots.
#define BLAMKA_ROUND1(index) \
132+
VMOVQ (index+0)(R4), V0; \
133+
VMOVQ (index+16)(R4), V1; \
134+
VMOVQ (index+32)(R4), V2; \
135+
VMOVQ (index+48)(R4), V3; \
136+
VMOVQ (index+64)(R4), V4; \
137+
VMOVQ (index+80)(R4), V5; \
138+
VMOVQ (index+96)(R4), V6; \
139+
VMOVQ (index+112)(R4), V7; \
140+
BLAMKA_ROUND; \
141+
VMOVQ V0, (index+0)(R4); \
142+
VMOVQ V1, (index+16)(R4); \
143+
VMOVQ V2, (index+32)(R4); \
144+
VMOVQ V3, (index+48)(R4); \
145+
VMOVQ V4, (index+64)(R4); \
146+
VMOVQ V5, (index+80)(R4); \
147+
VMOVQ V6, (index+96)(R4); \
148+
VMOVQ V7, (index+112)(R4); \
149+
150+
// BLAMKA_ROUND2(index) loads V0-V7 from eight 16-byte slots spaced 128 bytes
// apart, starting at byte offset index from R4, runs BLAMKA_ROUND on them,
// and stores V0-V7 back to the same slots.
#define BLAMKA_ROUND2(index) \
151+
VMOVQ (index+0)(R4), V0; \
152+
VMOVQ (index+128)(R4), V1; \
153+
VMOVQ (index+256)(R4), V2; \
154+
VMOVQ (index+384)(R4), V3; \
155+
VMOVQ (index+512)(R4), V4; \
156+
VMOVQ (index+640)(R4), V5; \
157+
VMOVQ (index+768)(R4), V6; \
158+
VMOVQ (index+896)(R4), V7; \
159+
BLAMKA_ROUND; \
160+
VMOVQ V0, (index+0)(R4); \
161+
VMOVQ V1, (index+128)(R4); \
162+
VMOVQ V2, (index+256)(R4); \
163+
VMOVQ V3, (index+384)(R4); \
164+
VMOVQ V4, (index+512)(R4); \
165+
VMOVQ V5, (index+640)(R4); \
166+
VMOVQ V6, (index+768)(R4); \
167+
VMOVQ V7, (index+896)(R4); \
168+
169+
// func blamkaVX(b *block)
//
// Permutes the block at b in place. R4 holds b for the whole function.
// The first eight macro calls process consecutive 128-byte groups (byte
// offsets 0, 128, ..., 896). The second eight process groups whose 16-byte
// slots are 128 bytes apart, starting at byte offsets 0, 16, ..., 112.
170+
TEXT ·blamkaVX(SB), NOSPLIT, $0-8
171+
MOVV b+0(FP), R4
172+
173+
BLAMKA_ROUND1(0)
174+
BLAMKA_ROUND1(128)
175+
BLAMKA_ROUND1(256)
176+
BLAMKA_ROUND1(384)
177+
BLAMKA_ROUND1(512)
178+
BLAMKA_ROUND1(640)
179+
BLAMKA_ROUND1(768)
180+
BLAMKA_ROUND1(896)
181+
182+
BLAMKA_ROUND2(0)
183+
BLAMKA_ROUND2(16)
184+
BLAMKA_ROUND2(32)
185+
BLAMKA_ROUND2(48)
186+
BLAMKA_ROUND2(64)
187+
BLAMKA_ROUND2(80)
188+
BLAMKA_ROUND2(96)
189+
BLAMKA_ROUND2(112)
190+
191+
RET
192+
193+
// func mixBlocks1VX(t *block, in1 *block, in2 *block)
194+
TEXT ·mixBlocks1VX(SB), NOSPLIT, $0-24
195+
MOVV t+0(FP), R4
196+
MOVV in1+8(FP), R5
197+
MOVV in2+16(FP), R6
198+
MOVV $128, R8
199+
200+
loop:
201+
VMOVQ (R5), V0
202+
VMOVQ (R6), V1
203+
VXORV V0, V1, V2
204+
VMOVQ V2, (R4)
205+
ADDV $16, R5
206+
ADDV $16, R6
207+
ADDV $16, R4
208+
SUBV $2, R8
209+
BLT R0, R8, loop
210+
RET
211+
212+
// func mixBlocks2VX(out *block, in1 *block, in2 *block, t *block)
//
// Overwrites out with in1 XOR in2 XOR t, 16 bytes per iteration.
// R8 counts the remaining 64-bit words and drops by 2 per 16-byte vector.
213+
TEXT ·mixBlocks2VX(SB), NOSPLIT, $0-32
214+
MOVV out+0(FP), R4
215+
MOVV in1+8(FP), R5
216+
MOVV in2+16(FP), R6
217+
MOVV t+24(FP), R7
218+
MOVV $128, R8
219+
220+
loop:
221+
VMOVQ (R5), V0
222+
VMOVQ (R6), V1
223+
VMOVQ (R7), V2
224+
VXORV V0, V1, V3
225+
VXORV V3, V2, V4
226+
VMOVQ V4, (R4)
227+
ADDV $16, R5
228+
ADDV $16, R6
229+
ADDV $16, R7
230+
ADDV $16, R4
231+
SUBV $2, R8
232+
BLT R0, R8, loop
233+
RET
234+
235+
// func xorBlocksVX(out *block, in1 *block, in2 *block, t *block)
//
// XORs in1 XOR in2 XOR t into the existing contents of out, 16 bytes per
// iteration: out is both loaded from and stored to through R4.
// R8 counts the remaining 64-bit words and drops by 2 per 16-byte vector.
236+
TEXT ·xorBlocksVX(SB), NOSPLIT, $0-32
237+
MOVV out+0(FP), R4
238+
MOVV in1+8(FP), R5
239+
MOVV in2+16(FP), R6
240+
MOVV t+24(FP), R7
241+
MOVV $128, R8
242+
243+
loop:
244+
VMOVQ (R5), V0
245+
VMOVQ (R6), V1
246+
VMOVQ (R7), V2
247+
VMOVQ (R4), V3
248+
VXORV V0, V1, V4
249+
VXORV V4, V2, V5
250+
VXORV V5, V3, V6
251+
VMOVQ V6, (R4)
252+
ADDV $16, R5
253+
ADDV $16, R6
254+
ADDV $16, R7
255+
ADDV $16, R4
256+
SUBV $2, R8
257+
BLT R0, R8, loop
258+
RET

argon2/blamka_ref.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
//go:build !amd64 || purego || !gc
5+
//go:build (!amd64 && !loong64) || purego || !gc
66

77
package argon2
88

0 commit comments

Comments
 (0)