Skip to content

Commit 7f516a3

Browse files
rsc authored and gopherbot committed
math/big: replace assembly with mini-compiler output
Step 4 of the mini-compiler: switch to the new generated assembly.
No systematic performance regressions, and many many improvements.

In the benchmarks, the systems are:

c3h88    GOARCH=amd64    c3h88 perf gomote (newer Intel, Google Cloud)
c2s16    GOARCH=amd64    c2s16 perf gomote (Intel, Google Cloud)
s7       GOARCH=amd64    rsc basement server (AMD Ryzen 9 7950X)
386      GOARCH=386      gotip-linux-386 gomote (Intel, Google Cloud)
s7-386   GOARCH=386      rsc basement server (AMD Ryzen 9 7950X)
c4as16   GOARCH=arm64    c4as16 perf gomote (Google Cloud)
mac      GOARCH=arm64    Apple M3 Pro in MacBook Pro
arm      GOARCH=arm      gotip-linux-arm gomote
loong64  GOARCH=loong64  gotip-linux-loong64 gomote
ppc64le  GOARCH=ppc64le  gotip-linux-ppc64le gomote
riscv64  GOARCH=riscv64  gotip-linux-riscv64 gomote
s390x    GOARCH=s390x    linux-s390x-ibm old gomote

benchmark \ system  c3h88  c2s16  s7  386  s7-386  c4as16  mac  arm  loong64  ppc64le  riscv64  s390x
AddVV/words=1  -4.03%  +5.21%  -4.04%  +4.94%  ~  ~  ~  ~  -19.51%  ~  ~  ~
AddVV/words=10  -10.20%  +0.34%  -3.46%  -11.50%  -7.46%  +7.66%  +5.97%  ~  -17.90%  ~  ~  ~
AddVV/words=16  -10.91%  -6.45%  -8.45%  -21.86%  -17.90%  +2.73%  -1.61%  ~  -22.47%  -3.54%  ~  ~
AddVV/words=100  -3.77%  -4.30%  -3.17%  -47.27%  -45.34%  -0.78%  ~  -8.74%  -27.19%  ~  ~  ~
AddVV/words=1000  -0.08%  -0.71%  ~  -49.21%  -48.07%  ~  ~  -16.80%  -24.74%  ~  ~  ~
AddVV/words=10000  ~  ~  ~  -48.73%  -48.56%  -0.06%  ~  -17.08%  ~  ~  -4.81%  ~
AddVV/words=100000  ~  ~  ~  -47.80%  -48.38%  ~  ~  -15.10%  -25.06%  ~  -5.34%  ~
SubVV/words=1  -0.84%  +3.43%  -3.62%  +1.34%  ~  -0.76%  ~  ~  -18.18%  +5.58%  ~  ~
SubVV/words=10  -9.99%  +0.34%  ~  -11.23%  -8.24%  +7.53%  +6.15%  ~  -17.55%  +2.77%  -2.08%  ~
SubVV/words=16  -11.94%  -6.45%  -6.81%  -21.82%  -18.11%  +1.58%  -1.21%  ~  -20.36%  ~  ~  ~
SubVV/words=100  -3.38%  -4.32%  -1.80%  -46.14%  -46.43%  +0.41%  ~  -7.20%  -26.17%  ~  -0.42%  ~
SubVV/words=1000  -0.38%  -0.80%  ~  -49.22%  -48.90%  ~  ~  -15.86%  -24.73%  ~  ~  ~
SubVV/words=10000  ~  ~  ~  -49.57%  -49.64%  -0.03%  ~  -15.85%  -26.52%  ~  -5.05%  ~
SubVV/words=100000  ~  ~  ~  -46.88%  -49.66%  ~  ~  -15.45%  -16.11%  ~  -4.99%  ~
LshVU/words=1  ~  +5.78%  ~  ~  -2.48%  +1.61%  +2.18%  +2.70%  -18.16%  -34.16%  -21.29%  ~
LshVU/words=10  -18.34%  -3.78%  +2.21%  ~  ~  -2.81%  -12.54%  ~  -25.02%  -24.78%  -38.11%  -66.98%
LshVU/words=16  -23.15%  +1.03%  +7.74%  +0.73%  ~  +8.88%  +1.56%  ~  -25.37%  -28.46%  -41.27%  ~
LshVU/words=100  -32.85%  -8.86%  -2.58%  ~  +2.69%  +1.24%  ~  -20.63%  -44.14%  -42.68%  -53.09%  ~
LshVU/words=1000  -37.30%  -0.20%  +5.67%  ~  ~  +1.44%  ~  -27.83%  -45.01%  -37.07%  -57.02%  -46.57%
LshVU/words=10000  -36.84%  -2.30%  +3.82%  ~  +1.86%  +1.57%  -66.81%  -28.00%  -13.15%  -35.40%  -41.97%  ~
LshVU/words=100000  -40.30%  ~  +3.96%  ~  ~  ~  ~  -24.91%  -19.06%  -36.14%  -40.99%  -66.03%
RshVU/words=1  -3.17%  +4.76%  -4.06%  +4.31%  +4.55%  ~  ~  ~  -20.61%  ~  -26.20%  -51.33%
RshVU/words=10  -22.08%  -4.41%  -17.99%  +3.64%  -11.87%  ~  -16.30%  ~  -30.01%  ~  -40.37%  -63.05%
RshVU/words=16  -26.03%  -8.50%  -18.09%  ~  -17.52%  +6.50%  ~  -2.85%  -30.24%  ~  -42.93%  -63.13%
RshVU/words=100  -20.87%  -28.83%  -29.45%  ~  -26.25%  +1.46%  -1.14%  -16.20%  -45.65%  -16.20%  -53.66%  -77.27%
RshVU/words=1000  -24.03%  -21.37%  -26.71%  ~  -28.95%  +0.98%  ~  -18.82%  -45.21%  -23.55%  -57.09%  -71.18%
RshVU/words=10000  -24.56%  -22.44%  -27.01%  ~  -28.88%  +0.78%  -5.35%  -17.47%  -16.87%  -20.67%  -41.97%  ~
RshVU/words=100000  -23.36%  -15.65%  -27.54%  ~  -29.26%  +1.73%  -6.67%  -13.68%  -21.40%  -23.02%  -40.37%  -66.31%
MulAddVWW/words=1  +2.37%  +8.14%  ~  +4.10%  +3.71%  ~  ~  ~  -21.62%  ~  +1.12%  ~
MulAddVWW/words=10  ~  -2.72%  -15.15%  +8.04%  ~  ~  ~  -2.52%  -19.48%  ~  -6.18%  ~
MulAddVWW/words=16  ~  +1.49%  ~  +4.49%  +6.58%  -8.70%  -7.16%  -12.08%  -21.43%  -6.59%  -9.05%  ~
MulAddVWW/words=100  +0.37%  +1.11%  -4.51%  -13.59%  ~  -11.10%  -3.63%  -21.40%  -22.27%  -2.92%  -14.41%  ~
MulAddVWW/words=1000  ~  +0.90%  -7.13%  -18.94%  ~  -14.02%  -9.97%  -28.31%  -18.72%  -2.32%  -15.80%  ~
MulAddVWW/words=10000  ~  +1.08%  -6.75%  -19.10%  ~  -14.61%  -9.04%  -28.48%  -14.29%  -2.25%  -9.40%  ~
MulAddVWW/words=100000  ~  ~  -6.93%  -18.09%  ~  -14.33%  -9.66%  -28.92%  -16.63%  -2.43%  -8.23%  ~
AddMulVVWW/words=1  +2.30%  +4.83%  -11.37%  +4.58%  ~  -3.14%  ~  ~  -10.58%  +30.35%  ~  ~
AddMulVVWW/words=10  -3.27%  ~  +8.96%  +5.74%  ~  +2.67%  -1.44%  -7.64%  -13.41%  ~  ~  ~
AddMulVVWW/words=16  -6.12%  ~  ~  ~  +1.91%  -7.90%  -16.22%  -14.07%  -14.26%  -4.15%  -7.30%  ~
AddMulVVWW/words=100  -5.48%  -2.14%  ~  -9.40%  +9.98%  -1.43%  -12.35%  -18.56%  -21.94%  ~  -9.84%  ~
AddMulVVWW/words=1000  -11.35%  -3.40%  -3.64%  -11.04%  +12.82%  -1.33%  -15.63%  -20.50%  -20.95%  ~  -11.06%  -51.97%
AddMulVVWW/words=10000  -10.31%  -1.61%  -8.41%  -12.15%  +13.10%  -1.03%  -16.34%  -22.46%  -1.00%  ~  -10.33%  -49.80%
AddMulVVWW/words=100000  -13.71%  ~  -8.31%  -12.18%  +12.98%  -1.35%  -15.20%  -21.89%  ~  ~  -9.38%  -48.30%

Change-Id: I0a33c33602c0d053c84d9946e662500cfa048e2d
Reviewed-on: https://go-review.googlesource.com/c/go/+/664938
Reviewed-by: Alan Donovan <adonovan@google.com>
Auto-Submit: Russ Cox <rsc@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
1 parent 39070da commit 7f516a3

17 files changed

+4196
-2519
lines changed

src/math/big/arith_386.s

Lines changed: 205 additions & 157 deletions
Original file line number | Diff line number | Diff line change
@@ -1,192 +1,240 @@
1-
// Copyright 2009 The Go Authors. All rights reserved.
1+
// Copyright 2025 The Go Authors. All rights reserved.
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5+
// Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
6+
57
//go:build !math_big_pure_go
68

79
#include "textflag.h"
810

9-
// This file provides fast assembly versions for the elementary
10-
// arithmetic operations on vectors implemented in arith.go.
11-
1211
// func addVV(z, x, y []Word) (c Word)
// NOTE(review): this span is a rendered diff. Lines of the form "N-", "N+" or a
// fused pair of numbers appear to be line-number markers; "N-" content is the
// removed hand-written loop and "N+" content is the generated replacement.
// Both versions add the words of x and y with carry propagation, store the
// sums in z, and write the final carry (0 or 1) to c. The generated version
// handles z_len%4 words one at a time (loop1) and then z_len/4 groups of four
// words (loop4). Between iterations the carry is parked in DX as 0 or -1
// (SBBL DX, DX) and moved back into CF with ADDL DX, DX.
13-
TEXT ·addVV(SB),NOSPLIT,$0
14-
MOVL z+0(FP), DI
15-
MOVL x+12(FP), SI
16-
MOVL y+24(FP), CX
17-
MOVL z_len+4(FP), BP
18-
MOVL $0, BX // i = 0
19-
MOVL $0, DX // c = 0
20-
JMP E1
21-
22-
L1: MOVL (SI)(BX*4), AX
23-
ADDL DX, DX // restore CF
24-
ADCL (CX)(BX*4), AX
25-
SBBL DX, DX // save CF
26-
MOVL AX, (DI)(BX*4)
27-
ADDL $1, BX // i++
28-
29-
E1: CMPL BX, BP // i < n
30-
JL L1
31-
32-
NEGL DX
12+
TEXT ·addVV(SB), NOSPLIT, $0
13+
MOVL z_len+4(FP), BX
14+
MOVL x_base+12(FP), SI
15+
MOVL y_base+24(FP), DI
16+
MOVL z_base+0(FP), BP
17+
// compute unrolled loop lengths
18+
MOVL BX, CX
19+
ANDL $3, CX // CX = z_len%4, iterations of the 1X loop
20+
SHRL $2, BX // BX = z_len/4, iterations of the 4X loop
21+
MOVL $0, DX // clear saved carry
22+
loop1:
23+
TESTL CX, CX; JZ loop1done
24+
loop1cont:
25+
// unroll 1X in batches of 1
26+
ADDL DX, DX // restore carry
27+
MOVL 0(SI), DX // MOVL does not touch CF, so DX can be reused for the data word
28+
ADCL 0(DI), DX
29+
MOVL DX, 0(BP)
30+
SBBL DX, DX // save carry
31+
LEAL 4(SI), SI // ADD $4, SI
32+
LEAL 4(DI), DI // ADD $4, DI
33+
LEAL 4(BP), BP // ADD $4, BP
34+
SUBL $1, CX; JNZ loop1cont // SUBL overwrites CF, which is why the carry was saved in DX
35+
loop1done:
36+
loop4:
37+
TESTL BX, BX; JZ loop4done
38+
loop4cont:
39+
// unroll 4X in batches of 1
40+
ADDL DX, DX // restore carry
41+
MOVL 0(SI), CX // CX is free as scratch once the 1X loop is done
42+
ADCL 0(DI), CX
43+
MOVL CX, 0(BP)
44+
MOVL 4(SI), CX
45+
ADCL 4(DI), CX
46+
MOVL CX, 4(BP)
47+
MOVL 8(SI), CX
48+
ADCL 8(DI), CX
49+
MOVL CX, 8(BP)
50+
MOVL 12(SI), CX
51+
ADCL 12(DI), CX
52+
MOVL CX, 12(BP)
53+
SBBL DX, DX // save carry
54+
LEAL 16(SI), SI // ADD $16, SI
55+
LEAL 16(DI), DI // ADD $16, DI
56+
LEAL 16(BP), BP // ADD $16, BP
57+
SUBL $1, BX; JNZ loop4cont
58+
loop4done:
59+
NEGL DX // convert add carry
3360
MOVL DX, c+36(FP)
3461
RET
3562

36-
3763
// func subVV(z, x, y []Word) (c Word)
// NOTE(review): this span is a rendered diff. Lines of the form "N-", "N+" or a
// fused pair of numbers appear to be line-number markers; "N-" content is the
// removed hand-written loop and "N+" content is the generated replacement.
// Both versions subtract the words of y from x with borrow propagation, store
// the differences in z, and write the final borrow (0 or 1) to c. The
// generated version handles z_len%4 words one at a time (loop1) and then
// z_len/4 groups of four words (loop4). Between iterations the borrow is
// parked in DX as 0 or -1 (SBBL DX, DX) and moved back into CF with
// ADDL DX, DX.
38-
// (same as addVV except for SBBL instead of ADCL and label names)
39-
TEXT ·subVV(SB),NOSPLIT,$0
40-
MOVL z+0(FP), DI
41-
MOVL x+12(FP), SI
42-
MOVL y+24(FP), CX
43-
MOVL z_len+4(FP), BP
44-
MOVL $0, BX // i = 0
45-
MOVL $0, DX // c = 0
46-
JMP E2
47-
48-
L2: MOVL (SI)(BX*4), AX
49-
ADDL DX, DX // restore CF
50-
SBBL (CX)(BX*4), AX
51-
SBBL DX, DX // save CF
52-
MOVL AX, (DI)(BX*4)
53-
ADDL $1, BX // i++
54-
55-
E2: CMPL BX, BP // i < n
56-
JL L2
57-
58-
NEGL DX
64+
TEXT ·subVV(SB), NOSPLIT, $0
65+
MOVL z_len+4(FP), BX
66+
MOVL x_base+12(FP), SI
67+
MOVL y_base+24(FP), DI
68+
MOVL z_base+0(FP), BP
69+
// compute unrolled loop lengths
70+
MOVL BX, CX
71+
ANDL $3, CX // CX = z_len%4, iterations of the 1X loop
72+
SHRL $2, BX // BX = z_len/4, iterations of the 4X loop
73+
MOVL $0, DX // clear saved carry
74+
loop1:
75+
TESTL CX, CX; JZ loop1done
76+
loop1cont:
77+
// unroll 1X in batches of 1
78+
ADDL DX, DX // restore carry
79+
MOVL 0(SI), DX // MOVL does not touch CF, so DX can be reused for the data word
80+
SBBL 0(DI), DX
81+
MOVL DX, 0(BP)
82+
SBBL DX, DX // save carry
83+
LEAL 4(SI), SI // ADD $4, SI
84+
LEAL 4(DI), DI // ADD $4, DI
85+
LEAL 4(BP), BP // ADD $4, BP
86+
SUBL $1, CX; JNZ loop1cont // SUBL overwrites CF, which is why the borrow was saved in DX
87+
loop1done:
88+
loop4:
89+
TESTL BX, BX; JZ loop4done
90+
loop4cont:
91+
// unroll 4X in batches of 1
92+
ADDL DX, DX // restore carry
93+
MOVL 0(SI), CX // CX is free as scratch once the 1X loop is done
94+
SBBL 0(DI), CX
95+
MOVL CX, 0(BP)
96+
MOVL 4(SI), CX
97+
SBBL 4(DI), CX
98+
MOVL CX, 4(BP)
99+
MOVL 8(SI), CX
100+
SBBL 8(DI), CX
101+
MOVL CX, 8(BP)
102+
MOVL 12(SI), CX
103+
SBBL 12(DI), CX
104+
MOVL CX, 12(BP)
105+
SBBL DX, DX // save carry
106+
LEAL 16(SI), SI // ADD $16, SI
107+
LEAL 16(DI), DI // ADD $16, DI
108+
LEAL 16(BP), BP // ADD $16, BP
109+
SUBL $1, BX; JNZ loop4cont
110+
loop4done:
111+
NEGL DX // convert sub carry
59112
MOVL DX, c+36(FP)
60113
RET
61114

62-
63115
// func lshVU(z, x []Word, s uint) (c Word)
// NOTE(review): this span is a rendered diff. Lines of the form "N-", "N+" or a
// fused pair of numbers appear to be line-number markers; "N-" content is
// removed, "N+" content is added, and fused-number lines are kept in both
// versions.
// With n = z_len, the generated version writes c = the bits shifted out of
// x[n-1], then walks i from n-1 down to 1 storing z[i] = x[i]<<s | x[i-1]>>ŝ,
// and finally stores z[0] = x[0]<<s. When n is 0 it only writes c = 0.
64-
TEXT ·lshVU(SB),NOSPLIT,$0
65-
MOVL z_len+4(FP), BX // i = z
66-
SUBL $1, BX // i--
67-
JL X8b // i < 0 (n <= 0)
68-
69-
// n > 0
70-
MOVL z+0(FP), DI
71-
MOVL x+12(FP), SI
116+
TEXT ·lshVU(SB), NOSPLIT, $0
117+
MOVL z_len+4(FP), BX
118+
TESTL BX, BX; JZ ret0 // empty z: only c = 0 is written
72119
MOVL s+24(FP), CX
73-
MOVL (SI)(BX*4), AX // w1 = x[n-1]
120+
MOVL x_base+12(FP), SI
121+
MOVL z_base+0(FP), DI
122+
// run loop backward, using counter as positive index
123+
// shift first word into carry
124+
MOVL -4(SI)(BX*4), BP // BP = x[n-1]
74125
MOVL $0, DX
75-
SHLL CX, AX, DX // w1>>ŝ
126+
SHLL CX, BP, DX // DX = bits shifted out of x[n-1] (DX was zeroed above)
76127
MOVL DX, c+28(FP)
77-
78-
CMPL BX, $0
79-
JLE X8a // i <= 0
80-
81-
// i > 0
82-
L8: MOVL AX, DX // w = w1
83-
MOVL -4(SI)(BX*4), AX // w1 = x[i-1]
84-
SHLL CX, AX, DX // w<<s | w1>>ŝ
85-
MOVL DX, (DI)(BX*4) // z[i] = w<<s | w1>>ŝ
86-
SUBL $1, BX // i--
87-
JG L8 // i > 0
88-
89-
// i <= 0
90-
X8a: SHLL CX, AX // w1<<s
91-
MOVL AX, (DI) // z[0] = w1<<s
128+
// shift remaining words
129+
SUBL $1, BX
130+
loop1:
131+
TESTL BX, BX; JZ loop1done
132+
loop1cont:
133+
// unroll 1X in batches of 1
134+
MOVL -4(SI)(BX*4), DX // DX = x[i-1], with i = BX
135+
SHLL CX, DX, BP // BP = x[i]<<s | x[i-1]>>ŝ
136+
MOVL BP, 0(DI)(BX*4) // z[i] = BP
137+
MOVL DX, BP // x[i-1] becomes the current word for the next iteration
138+
SUBL $1, BX; JNZ loop1cont
139+
loop1done:
140+
// store final shifted bits
141+
SHLL CX, BP
142+
MOVL BP, 0(DI)(BX*4) // BX is 0 here, so this stores z[0]
92143
RET
93-
94-
X8b: MOVL $0, c+28(FP)
144+
ret0:
145+
MOVL $0, c+28(FP)
95146
RET
96147

97-
98148
// func rshVU(z, x []Word, s uint) (c Word)
// NOTE(review): this span is a rendered diff. Lines of the form "N-", "N+" or a
// fused pair of numbers appear to be line-number markers; "N-" content is
// removed, "N+" content is added, and fused-number lines are kept in both
// versions.
// With n = z_len, the generated version writes c = the bits shifted out of
// x[0], then walks i from 0 up to n-2 storing z[i] = x[i]>>s | x[i+1]<<ŝ, and
// finally stores z[n-1] = x[n-1]>>s. SI and DI point one past the end of x
// and z, and BX counts up from -n towards 0. When n is 0 it only writes
// c = 0.
99-
TEXT ·rshVU(SB),NOSPLIT,$0
100-
MOVL z_len+4(FP), BP
101-
SUBL $1, BP // n--
102-
JL X9b // n < 0 (n <= 0)
103-
104-
// n > 0
105-
MOVL z+0(FP), DI
106-
MOVL x+12(FP), SI
149+
TEXT ·rshVU(SB), NOSPLIT, $0
150+
MOVL z_len+4(FP), BX
151+
TESTL BX, BX; JZ ret0 // empty z: only c = 0 is written
107152
MOVL s+24(FP), CX
108-
MOVL (SI), AX // w1 = x[0]
153+
MOVL x_base+12(FP), SI
154+
MOVL z_base+0(FP), DI
155+
// use counter as negative index
156+
LEAL (SI)(BX*4), SI // SI = &x[n]
157+
LEAL (DI)(BX*4), DI // DI = &z[n]
158+
NEGL BX // BX = -n
159+
// shift first word into carry
160+
MOVL 0(SI)(BX*4), BP // BP = x[0]
109161
MOVL $0, DX
110-
SHRL CX, AX, DX // w1<<ŝ
162+
SHRL CX, BP, DX // DX = bits shifted out of x[0] (DX was zeroed above)
111163
MOVL DX, c+28(FP)
112-
113-
MOVL $0, BX // i = 0
114-
JMP E9
115-
116-
// i < n-1
117-
L9: MOVL AX, DX // w = w1
118-
MOVL 4(SI)(BX*4), AX // w1 = x[i+1]
119-
SHRL CX, AX, DX // w>>s | w1<<ŝ
120-
MOVL DX, (DI)(BX*4) // z[i] = w>>s | w1<<ŝ
121-
ADDL $1, BX // i++
122-
123-
E9: CMPL BX, BP
124-
JL L9 // i < n-1
125-
126-
// i >= n-1
127-
X9a: SHRL CX, AX // w1>>s
128-
MOVL AX, (DI)(BP*4) // z[n-1] = w1>>s
164+
// shift remaining words
165+
ADDL $1, BX // BX = -(n-1)
166+
loop1:
167+
TESTL BX, BX; JZ loop1done
168+
loop1cont:
169+
// unroll 1X in batches of 1
170+
MOVL 0(SI)(BX*4), DX // DX = x[i+1], with i = n+BX-1
171+
SHRL CX, DX, BP // BP = x[i]>>s | x[i+1]<<ŝ
172+
MOVL BP, -4(DI)(BX*4) // z[i] = BP
173+
MOVL DX, BP // x[i+1] becomes the current word for the next iteration
174+
ADDL $1, BX; JNZ loop1cont
175+
loop1done:
176+
// store final shifted bits
177+
SHRL CX, BP
178+
MOVL BP, -4(DI)(BX*4) // BX is 0 here, so this stores z[n-1]
129179
RET
130-
131-
X9b: MOVL $0, c+28(FP)
180+
ret0:
181+
MOVL $0, c+28(FP)
132182
RET
133183

134-
135184
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
// NOTE(review): this span is a rendered diff. Lines of the form "N-", "N+" or a
// fused pair of numbers appear to be line-number markers; "N-" content is the
// removed hand-written loop and "N+" content is the generated replacement.
// Both versions compute, for each i in order, the two-word value
// x[i]*m + carry (carry starts as a), store its low word in z[i], and keep
// its high word as the next carry; the last carry is written to c. Pointers
// are advanced to the end of the slices and the counter runs from -z_len up
// to 0.
136-
TEXT ·mulAddVWW(SB),NOSPLIT,$0
137-
MOVL z+0(FP), DI
138-
MOVL x+12(FP), SI
139-
MOVL m+24(FP), BP
140-
MOVL a+28(FP), CX // c = a
141-
MOVL z_len+4(FP), BX
142-
LEAL (DI)(BX*4), DI
143-
LEAL (SI)(BX*4), SI
144-
NEGL BX // i = -n
145-
JMP E5
146-
147-
L5: MOVL (SI)(BX*4), AX
148-
MULL BP
149-
ADDL CX, AX
150-
ADCL $0, DX
151-
MOVL AX, (DI)(BX*4)
152-
MOVL DX, CX
153-
ADDL $1, BX // i++
154-
155-
E5: CMPL BX, $0 // i < 0
156-
JL L5
157-
158-
MOVL CX, c+32(FP)
185+
TEXT ·mulAddVWW(SB), NOSPLIT, $0
186+
MOVL m+24(FP), BX
187+
MOVL a+28(FP), SI // SI = running carry, initially a
188+
MOVL z_len+4(FP), DI
189+
MOVL x_base+12(FP), BP
190+
MOVL z_base+0(FP), CX
191+
// use counter as negative index
192+
LEAL (BP)(DI*4), BP // BP = &x[n]
193+
LEAL (CX)(DI*4), CX // CX = &z[n]
194+
NEGL DI // DI = -n
195+
loop1:
196+
TESTL DI, DI; JZ loop1done
197+
loop1cont:
198+
// unroll 1X in batches of 1
199+
MOVL 0(BP)(DI*4), AX
200+
// multiply
201+
MULL BX // DX:AX = x[i] * m
202+
ADDL SI, AX // add the incoming carry to the low word
203+
MOVL DX, SI // MOVL keeps CF from the ADDL intact
204+
ADCL $0, SI // SI = high word + CF = outgoing carry
205+
MOVL AX, 0(CX)(DI*4)
206+
ADDL $1, DI; JNZ loop1cont
207+
loop1done:
208+
MOVL SI, c+32(FP)
159209
RET
160210

161-
162211
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
// NOTE(review): this span is a rendered diff. Lines of the form "N-", "N+" or a
// fused pair of numbers appear to be line-number markers; "N-" content is
// removed, "N+" content is added, and fused-number lines are kept in both
// versions.
// Both versions compute, for each i in order, the two-word value
// x[i] + y[i]*m + carry (carry starts as a), store its low word in z[i], and
// keep its high word as the next carry; the last carry is written to c.
// Pointers are advanced to the end of the slices and the counter runs from
// -z_len up to 0. m is read from its stack slot on every iteration.
163-
TEXT ·addMulVVWW(SB),NOSPLIT,$0
164-
MOVL z+0(FP), BP
165-
MOVL x+12(FP), DI
166-
MOVL y+24(FP), SI
167-
MOVL a+40(FP), CX
168-
MOVL z_len+4(FP), BX
169-
LEAL (DI)(BX*4), DI
170-
LEAL (SI)(BX*4), SI
171-
LEAL (BP)(BX*4), BP
172-
NEGL BX // i = -n
173-
JMP E6
174-
175-
L6: MOVL (SI)(BX*4), AX
212+
TEXT ·addMulVVWW(SB), NOSPLIT, $0
213+
MOVL a+40(FP), BX // BX = running carry, initially a
214+
MOVL z_len+4(FP), SI
215+
MOVL x_base+12(FP), DI
216+
MOVL y_base+24(FP), BP
217+
MOVL z_base+0(FP), CX
218+
// use counter as negative index
219+
LEAL (DI)(SI*4), DI // DI = &x[n]
220+
LEAL (BP)(SI*4), BP // BP = &y[n]
221+
LEAL (CX)(SI*4), CX // CX = &z[n]
222+
NEGL SI // SI = -n
223+
loop1:
224+
TESTL SI, SI; JZ loop1done
225+
loop1cont:
226+
// unroll 1X in batches of 1
227+
MOVL 0(BP)(SI*4), AX
228+
// multiply
176229
MULL m+36(FP) // DX:AX = y[i] * m
177-
ADDL CX, AX
178-
ADCL $0, DX
179-
ADDL (DI)(BX*4), AX
180-
MOVL AX, (BP)(BX*4)
181-
ADCL $0, DX
182-
MOVL DX, CX
183-
ADDL $1, BX // i++
184-
185-
E6: CMPL BX, $0 // i < 0
186-
JL L6
187-
188-
MOVL CX, c+44(FP)
230+
ADDL BX, AX // add the incoming carry to the low word
231+
MOVL DX, BX // MOVL keeps CF from the ADDL intact
232+
ADCL $0, BX // BX = high word + CF
233+
// add
234+
ADDL 0(DI)(SI*4), AX
235+
ADCL $0, BX // fold the carry of the x[i] addition into BX
236+
MOVL AX, 0(CX)(SI*4)
237+
ADDL $1, SI; JNZ loop1cont
238+
loop1done:
239+
MOVL BX, c+44(FP)
189240
RET
190-
191-
192-

0 commit comments

Comments
 (0)