|
1 |
| -// Copyright 2009 The Go Authors. All rights reserved. |
| 1 | +// Copyright 2025 The Go Authors. All rights reserved. |
2 | 2 | // Use of this source code is governed by a BSD-style
|
3 | 3 | // license that can be found in the LICENSE file.
|
4 | 4 |
|
| 5 | +// Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT. |
| 6 | + |
5 | 7 | //go:build !math_big_pure_go
|
6 | 8 |
|
7 | 9 | #include "textflag.h"
|
8 | 10 |
|
9 |
| -// This file provides fast assembly versions for the elementary |
10 |
| -// arithmetic operations on vectors implemented in arith.go. |
11 |
| - |
// Review notes for addVV (added "+" side of this diff; the "-" lines are the
// removed loop that handled one word per iteration through an index register):
//   - Computes z[i] = x[i] + y[i] + carry over z_len words and returns the
//     final carry in c, which is 0 or 1 after the closing NEGL.
//   - Arguments sit at 12-byte strides (z at 0, x at 12, y at 24, c at 36) and
//     the data is addressed in 4-byte words.
//   - CX = z_len & 3 drives loop1 (one word per pass) and BX = z_len >> 2
//     drives loop4 (four words per pass); loop1 runs first.
//   - DX carries CF between passes: SBBL DX, DX stores it as 0 or -1 and
//     ADDL DX, DX turns that back into CF. In loop1 DX doubles as the data
//     temporary once the carry is back in CF (MOVL does not modify flags);
//     loop4 uses CX as its temporary, which is free because loop1 has already
//     counted it down to zero.
//   - Only z_len is read for the length; the lengths of x and y are not
//     consulted here.
12 | 11 | // func addVV(z, x, y []Word) (c Word)
|
13 |
| -TEXT ·addVV(SB),NOSPLIT,$0 |
14 |
| - MOVL z+0(FP), DI |
15 |
| - MOVL x+12(FP), SI |
16 |
| - MOVL y+24(FP), CX |
17 |
| - MOVL z_len+4(FP), BP |
18 |
| - MOVL $0, BX // i = 0 |
19 |
| - MOVL $0, DX // c = 0 |
20 |
| - JMP E1 |
21 |
| - |
22 |
| -L1: MOVL (SI)(BX*4), AX |
23 |
| - ADDL DX, DX // restore CF |
24 |
| - ADCL (CX)(BX*4), AX |
25 |
| - SBBL DX, DX // save CF |
26 |
| - MOVL AX, (DI)(BX*4) |
27 |
| - ADDL $1, BX // i++ |
28 |
| - |
29 |
| -E1: CMPL BX, BP // i < n |
30 |
| - JL L1 |
31 |
| - |
32 |
| - NEGL DX |
| 12 | +TEXT ·addVV(SB), NOSPLIT, $0 |
| 13 | + MOVL z_len+4(FP), BX |
| 14 | + MOVL x_base+12(FP), SI |
| 15 | + MOVL y_base+24(FP), DI |
| 16 | + MOVL z_base+0(FP), BP |
| 17 | + // compute unrolled loop lengths |
| 18 | + MOVL BX, CX |
| 19 | + ANDL $3, CX |
| 20 | + SHRL $2, BX |
| 21 | + MOVL $0, DX // clear saved carry |
| 22 | +loop1: |
| 23 | + TESTL CX, CX; JZ loop1done |
| 24 | +loop1cont: |
| 25 | + // unroll 1X in batches of 1 |
| 26 | + ADDL DX, DX // restore carry |
| 27 | + MOVL 0(SI), DX |
| 28 | + ADCL 0(DI), DX |
| 29 | + MOVL DX, 0(BP) |
| 30 | + SBBL DX, DX // save carry |
| 31 | + LEAL 4(SI), SI // ADD $4, SI |
| 32 | + LEAL 4(DI), DI // ADD $4, DI |
| 33 | + LEAL 4(BP), BP // ADD $4, BP |
| 34 | + SUBL $1, CX; JNZ loop1cont |
| 35 | +loop1done: |
| 36 | +loop4: |
| 37 | + TESTL BX, BX; JZ loop4done |
| 38 | +loop4cont: |
| 39 | + // unroll 4X in batches of 1 |
| 40 | + ADDL DX, DX // restore carry |
| 41 | + MOVL 0(SI), CX |
| 42 | + ADCL 0(DI), CX |
| 43 | + MOVL CX, 0(BP) |
| 44 | + MOVL 4(SI), CX |
| 45 | + ADCL 4(DI), CX |
| 46 | + MOVL CX, 4(BP) |
| 47 | + MOVL 8(SI), CX |
| 48 | + ADCL 8(DI), CX |
| 49 | + MOVL CX, 8(BP) |
| 50 | + MOVL 12(SI), CX |
| 51 | + ADCL 12(DI), CX |
| 52 | + MOVL CX, 12(BP) |
| 53 | + SBBL DX, DX // save carry |
| 54 | + LEAL 16(SI), SI // ADD $16, SI |
| 55 | + LEAL 16(DI), DI // ADD $16, DI |
| 56 | + LEAL 16(BP), BP // ADD $16, BP |
| 57 | + SUBL $1, BX; JNZ loop4cont |
| 58 | +loop4done: |
| 59 | + NEGL DX // convert add carry |
33 | 60 | MOVL DX, c+36(FP)
|
34 | 61 | RET
|
35 | 62 |
|
36 |
| - |
// Review notes for subVV (added "+" side of this diff; the "-" lines are the
// removed loop that handled one word per iteration):
//   - Computes z[i] = x[i] - y[i] - borrow over z_len words and returns the
//     final borrow in c, which is 0 or 1 after the closing NEGL.
//   - Register roles and loop structure match addVV in this file: CX =
//     z_len & 3 drives loop1, BX = z_len >> 2 drives loop4, SI/DI/BP walk
//     x/y/z in 4-byte steps.
//   - The only instruction-level difference from addVV is SBBL in place of
//     ADCL for the data words; the borrow is still parked in DX with
//     SBBL DX, DX (0 or -1) and restored with ADDL DX, DX.
//   - Only z_len is read for the length; the lengths of x and y are not
//     consulted here.
37 | 63 | // func subVV(z, x, y []Word) (c Word)
|
38 |
| -// (same as addVV except for SBBL instead of ADCL and label names) |
39 |
| -TEXT ·subVV(SB),NOSPLIT,$0 |
40 |
| - MOVL z+0(FP), DI |
41 |
| - MOVL x+12(FP), SI |
42 |
| - MOVL y+24(FP), CX |
43 |
| - MOVL z_len+4(FP), BP |
44 |
| - MOVL $0, BX // i = 0 |
45 |
| - MOVL $0, DX // c = 0 |
46 |
| - JMP E2 |
47 |
| - |
48 |
| -L2: MOVL (SI)(BX*4), AX |
49 |
| - ADDL DX, DX // restore CF |
50 |
| - SBBL (CX)(BX*4), AX |
51 |
| - SBBL DX, DX // save CF |
52 |
| - MOVL AX, (DI)(BX*4) |
53 |
| - ADDL $1, BX // i++ |
54 |
| - |
55 |
| -E2: CMPL BX, BP // i < n |
56 |
| - JL L2 |
57 |
| - |
58 |
| - NEGL DX |
| 64 | +TEXT ·subVV(SB), NOSPLIT, $0 |
| 65 | + MOVL z_len+4(FP), BX |
| 66 | + MOVL x_base+12(FP), SI |
| 67 | + MOVL y_base+24(FP), DI |
| 68 | + MOVL z_base+0(FP), BP |
| 69 | + // compute unrolled loop lengths |
| 70 | + MOVL BX, CX |
| 71 | + ANDL $3, CX |
| 72 | + SHRL $2, BX |
| 73 | + MOVL $0, DX // clear saved carry |
| 74 | +loop1: |
| 75 | + TESTL CX, CX; JZ loop1done |
| 76 | +loop1cont: |
| 77 | + // unroll 1X in batches of 1 |
| 78 | + ADDL DX, DX // restore carry |
| 79 | + MOVL 0(SI), DX |
| 80 | + SBBL 0(DI), DX |
| 81 | + MOVL DX, 0(BP) |
| 82 | + SBBL DX, DX // save carry |
| 83 | + LEAL 4(SI), SI // ADD $4, SI |
| 84 | + LEAL 4(DI), DI // ADD $4, DI |
| 85 | + LEAL 4(BP), BP // ADD $4, BP |
| 86 | + SUBL $1, CX; JNZ loop1cont |
| 87 | +loop1done: |
| 88 | +loop4: |
| 89 | + TESTL BX, BX; JZ loop4done |
| 90 | +loop4cont: |
| 91 | + // unroll 4X in batches of 1 |
| 92 | + ADDL DX, DX // restore carry |
| 93 | + MOVL 0(SI), CX |
| 94 | + SBBL 0(DI), CX |
| 95 | + MOVL CX, 0(BP) |
| 96 | + MOVL 4(SI), CX |
| 97 | + SBBL 4(DI), CX |
| 98 | + MOVL CX, 4(BP) |
| 99 | + MOVL 8(SI), CX |
| 100 | + SBBL 8(DI), CX |
| 101 | + MOVL CX, 8(BP) |
| 102 | + MOVL 12(SI), CX |
| 103 | + SBBL 12(DI), CX |
| 104 | + MOVL CX, 12(BP) |
| 105 | + SBBL DX, DX // save carry |
| 106 | + LEAL 16(SI), SI // ADD $16, SI |
| 107 | + LEAL 16(DI), DI // ADD $16, DI |
| 108 | + LEAL 16(BP), BP // ADD $16, BP |
| 109 | + SUBL $1, BX; JNZ loop4cont |
| 110 | +loop4done: |
| 111 | + NEGL DX // convert sub carry |
59 | 112 | MOVL DX, c+36(FP)
|
60 | 113 | RET
|
61 | 114 |
|
62 |
| - |
// Review notes for lshVU (added "+" side of this diff):
//   - z_len == 0 jumps to ret0, which returns c = 0 without reading x or
//     writing z.
//   - Otherwise the top word x[n-1] is loaded into BP, and the two-register
//     SHLL CX, BP, DX shifts BP's high bits into the zeroed DX; that value is
//     stored to c before the loop starts.
//   - BX then counts down from n-1 to 1. Each pass loads x[BX-1] into DX,
//     forms z[BX] from BP shifted left with DX's high bits filling in from
//     below, and keeps DX in BP as the next pass's high word.
//   - After the loop BX is 0, so the final store writes BP << s to z[0].
//   - Within each pass x[BX-1] is read before z[BX] is written.
//   - NOTE(review): nothing here range-checks s; presumably callers keep it
//     below the word size - confirm against the Go-side caller.
63 | 115 | // func lshVU(z, x []Word, s uint) (c Word)
|
64 |
| -TEXT ·lshVU(SB),NOSPLIT,$0 |
65 |
| - MOVL z_len+4(FP), BX // i = z |
66 |
| - SUBL $1, BX // i-- |
67 |
| - JL X8b // i < 0 (n <= 0) |
68 |
| - |
69 |
| - // n > 0 |
70 |
| - MOVL z+0(FP), DI |
71 |
| - MOVL x+12(FP), SI |
| 116 | +TEXT ·lshVU(SB), NOSPLIT, $0 |
| 117 | + MOVL z_len+4(FP), BX |
| 118 | + TESTL BX, BX; JZ ret0 |
72 | 119 | MOVL s+24(FP), CX
|
73 |
| - MOVL (SI)(BX*4), AX // w1 = x[n-1] |
| 120 | + MOVL x_base+12(FP), SI |
| 121 | + MOVL z_base+0(FP), DI |
| 122 | + // run loop backward, using counter as positive index |
| 123 | + // shift first word into carry |
| 124 | + MOVL -4(SI)(BX*4), BP |
74 | 125 | MOVL $0, DX
|
75 |
| - SHLL CX, AX, DX // w1>>ŝ |
| 126 | + SHLL CX, BP, DX |
76 | 127 | MOVL DX, c+28(FP)
|
77 |
| - |
78 |
| - CMPL BX, $0 |
79 |
| - JLE X8a // i <= 0 |
80 |
| - |
81 |
| - // i > 0 |
82 |
| -L8: MOVL AX, DX // w = w1 |
83 |
| - MOVL -4(SI)(BX*4), AX // w1 = x[i-1] |
84 |
| - SHLL CX, AX, DX // w<<s | w1>>ŝ |
85 |
| - MOVL DX, (DI)(BX*4) // z[i] = w<<s | w1>>ŝ |
86 |
| - SUBL $1, BX // i-- |
87 |
| - JG L8 // i > 0 |
88 |
| - |
89 |
| - // i <= 0 |
90 |
| -X8a: SHLL CX, AX // w1<<s |
91 |
| - MOVL AX, (DI) // z[0] = w1<<s |
| 128 | + // shift remaining words |
| 129 | + SUBL $1, BX |
| 130 | +loop1: |
| 131 | + TESTL BX, BX; JZ loop1done |
| 132 | +loop1cont: |
| 133 | + // unroll 1X in batches of 1 |
| 134 | + MOVL -4(SI)(BX*4), DX |
| 135 | + SHLL CX, DX, BP |
| 136 | + MOVL BP, 0(DI)(BX*4) |
| 137 | + MOVL DX, BP |
| 138 | + SUBL $1, BX; JNZ loop1cont |
| 139 | +loop1done: |
| 140 | + // store final shifted bits |
| 141 | + SHLL CX, BP |
| 142 | + MOVL BP, 0(DI)(BX*4) |
92 | 143 | RET
|
93 |
| - |
94 |
| -X8b: MOVL $0, c+28(FP) |
| 144 | +ret0: |
| 145 | + MOVL $0, c+28(FP) |
95 | 146 | RET
|
96 | 147 |
|
97 |
| - |
// Review notes for rshVU (added "+" side of this diff):
//   - z_len == 0 jumps to ret0, which returns c = 0 without reading x or
//     writing z.
//   - SI and DI are advanced to one word past the end of x and z and BX is
//     negated, so 0(SI)(BX*4) starts at x[0] and BX counts up toward zero.
//   - x[0] is loaded into BP, and the two-register SHRL CX, BP, DX shifts
//     BP's low bits into the top of the zeroed DX; that value is stored to c
//     before the loop starts.
//   - Each pass loads the next higher word into DX, forms the output word
//     from BP shifted right with DX's low bits filling in from the top, stores
//     it one slot below the load index (-4(DI)(BX*4)), and moves DX into BP.
//   - When BX reaches 0 the final store lands at -4(DI), i.e. z[n-1], and
//     holds BP >> s.
//   - NOTE(review): nothing here range-checks s; presumably callers keep it
//     below the word size - confirm against the Go-side caller.
98 | 148 | // func rshVU(z, x []Word, s uint) (c Word)
|
99 |
| -TEXT ·rshVU(SB),NOSPLIT,$0 |
100 |
| - MOVL z_len+4(FP), BP |
101 |
| - SUBL $1, BP // n-- |
102 |
| - JL X9b // n < 0 (n <= 0) |
103 |
| - |
104 |
| - // n > 0 |
105 |
| - MOVL z+0(FP), DI |
106 |
| - MOVL x+12(FP), SI |
| 149 | +TEXT ·rshVU(SB), NOSPLIT, $0 |
| 150 | + MOVL z_len+4(FP), BX |
| 151 | + TESTL BX, BX; JZ ret0 |
107 | 152 | MOVL s+24(FP), CX
|
108 |
| - MOVL (SI), AX // w1 = x[0] |
| 153 | + MOVL x_base+12(FP), SI |
| 154 | + MOVL z_base+0(FP), DI |
| 155 | + // use counter as negative index |
| 156 | + LEAL (SI)(BX*4), SI |
| 157 | + LEAL (DI)(BX*4), DI |
| 158 | + NEGL BX |
| 159 | + // shift first word into carry |
| 160 | + MOVL 0(SI)(BX*4), BP |
109 | 161 | MOVL $0, DX
|
110 |
| - SHRL CX, AX, DX // w1<<ŝ |
| 162 | + SHRL CX, BP, DX |
111 | 163 | MOVL DX, c+28(FP)
|
112 |
| - |
113 |
| - MOVL $0, BX // i = 0 |
114 |
| - JMP E9 |
115 |
| - |
116 |
| - // i < n-1 |
117 |
| -L9: MOVL AX, DX // w = w1 |
118 |
| - MOVL 4(SI)(BX*4), AX // w1 = x[i+1] |
119 |
| - SHRL CX, AX, DX // w>>s | w1<<ŝ |
120 |
| - MOVL DX, (DI)(BX*4) // z[i] = w>>s | w1<<ŝ |
121 |
| - ADDL $1, BX // i++ |
122 |
| - |
123 |
| -E9: CMPL BX, BP |
124 |
| - JL L9 // i < n-1 |
125 |
| - |
126 |
| - // i >= n-1 |
127 |
| -X9a: SHRL CX, AX // w1>>s |
128 |
| - MOVL AX, (DI)(BP*4) // z[n-1] = w1>>s |
| 164 | + // shift remaining words |
| 165 | + ADDL $1, BX |
| 166 | +loop1: |
| 167 | + TESTL BX, BX; JZ loop1done |
| 168 | +loop1cont: |
| 169 | + // unroll 1X in batches of 1 |
| 170 | + MOVL 0(SI)(BX*4), DX |
| 171 | + SHRL CX, DX, BP |
| 172 | + MOVL BP, -4(DI)(BX*4) |
| 173 | + MOVL DX, BP |
| 174 | + ADDL $1, BX; JNZ loop1cont |
| 175 | +loop1done: |
| 176 | + // store final shifted bits |
| 177 | + SHRL CX, BP |
| 178 | + MOVL BP, -4(DI)(BX*4) |
129 | 179 | RET
|
130 |
| - |
131 |
| -X9b: MOVL $0, c+28(FP) |
| 180 | +ret0: |
| 181 | + MOVL $0, c+28(FP) |
132 | 182 | RET
|
133 | 183 |
|
134 |
| - |
// Review notes for mulAddVWW (added "+" side of this diff):
//   - For each of z_len words, stores the low word of x[i]*m + carry into
//     z[i]; the carry is seeded from a and the final carry is returned in c.
//   - Register roles: BX = m, SI = running carry, DI = negative index that
//     counts up to 0, BP and CX = one word past the end of x and z.
//   - MULL BX leaves the product in DX:AX. ADDL SI, AX adds the previous
//     carry to the low word; the MOVL DX, SI that follows does not modify
//     flags, so ADCL $0, SI folds the ADDL carry-out into the new high word.
//   - z_len == 0 skips the loop entirely and returns a unchanged as c.
135 | 184 | // func mulAddVWW(z, x []Word, m, a Word) (c Word)
|
136 |
| -TEXT ·mulAddVWW(SB),NOSPLIT,$0 |
137 |
| - MOVL z+0(FP), DI |
138 |
| - MOVL x+12(FP), SI |
139 |
| - MOVL m+24(FP), BP |
140 |
| - MOVL a+28(FP), CX // c = a |
141 |
| - MOVL z_len+4(FP), BX |
142 |
| - LEAL (DI)(BX*4), DI |
143 |
| - LEAL (SI)(BX*4), SI |
144 |
| - NEGL BX // i = -n |
145 |
| - JMP E5 |
146 |
| - |
147 |
| -L5: MOVL (SI)(BX*4), AX |
148 |
| - MULL BP |
149 |
| - ADDL CX, AX |
150 |
| - ADCL $0, DX |
151 |
| - MOVL AX, (DI)(BX*4) |
152 |
| - MOVL DX, CX |
153 |
| - ADDL $1, BX // i++ |
154 |
| - |
155 |
| -E5: CMPL BX, $0 // i < 0 |
156 |
| - JL L5 |
157 |
| - |
158 |
| - MOVL CX, c+32(FP) |
| 185 | +TEXT ·mulAddVWW(SB), NOSPLIT, $0 |
| 186 | + MOVL m+24(FP), BX |
| 187 | + MOVL a+28(FP), SI |
| 188 | + MOVL z_len+4(FP), DI |
| 189 | + MOVL x_base+12(FP), BP |
| 190 | + MOVL z_base+0(FP), CX |
| 191 | + // use counter as negative index |
| 192 | + LEAL (BP)(DI*4), BP |
| 193 | + LEAL (CX)(DI*4), CX |
| 194 | + NEGL DI |
| 195 | +loop1: |
| 196 | + TESTL DI, DI; JZ loop1done |
| 197 | +loop1cont: |
| 198 | + // unroll 1X in batches of 1 |
| 199 | + MOVL 0(BP)(DI*4), AX |
| 200 | + // multiply |
| 201 | + MULL BX |
| 202 | + ADDL SI, AX |
| 203 | + MOVL DX, SI |
| 204 | + ADCL $0, SI |
| 205 | + MOVL AX, 0(CX)(DI*4) |
| 206 | + ADDL $1, DI; JNZ loop1cont |
| 207 | +loop1done: |
| 208 | + MOVL SI, c+32(FP) |
159 | 209 | RET
|
160 | 210 |
|
161 |
| - |
// Review notes for addMulVVWW (added "+" side of this diff):
//   - For each of z_len words, stores the low word of x[i] + y[i]*m + carry
//     into z[i]; the carry is seeded from a and the final carry is returned
//     in c.
//   - Register roles: BX = running carry, SI = negative index that counts up
//     to 0, DI, BP and CX = one word past the end of x, y and z.
//   - m is not held in a register: MULL reads it from its argument slot
//     m+36(FP) on every pass.
//   - There are two carry folds per word: ADCL $0, BX after adding the
//     previous carry to the low product word, and a second ADCL $0, BX after
//     adding x[i]. The MOVL between the first ADDL and ADCL does not modify
//     flags.
//   - z_len == 0 skips the loop entirely and returns a unchanged as c.
162 | 211 | // func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
|
163 |
| -TEXT ·addMulVVWW(SB),NOSPLIT,$0 |
164 |
| - MOVL z+0(FP), BP |
165 |
| - MOVL x+12(FP), DI |
166 |
| - MOVL y+24(FP), SI |
167 |
| - MOVL a+40(FP), CX |
168 |
| - MOVL z_len+4(FP), BX |
169 |
| - LEAL (DI)(BX*4), DI |
170 |
| - LEAL (SI)(BX*4), SI |
171 |
| - LEAL (BP)(BX*4), BP |
172 |
| - NEGL BX // i = -n |
173 |
| - JMP E6 |
174 |
| - |
175 |
| -L6: MOVL (SI)(BX*4), AX |
| 212 | +TEXT ·addMulVVWW(SB), NOSPLIT, $0 |
| 213 | + MOVL a+40(FP), BX |
| 214 | + MOVL z_len+4(FP), SI |
| 215 | + MOVL x_base+12(FP), DI |
| 216 | + MOVL y_base+24(FP), BP |
| 217 | + MOVL z_base+0(FP), CX |
| 218 | + // use counter as negative index |
| 219 | + LEAL (DI)(SI*4), DI |
| 220 | + LEAL (BP)(SI*4), BP |
| 221 | + LEAL (CX)(SI*4), CX |
| 222 | + NEGL SI |
| 223 | +loop1: |
| 224 | + TESTL SI, SI; JZ loop1done |
| 225 | +loop1cont: |
| 226 | + // unroll 1X in batches of 1 |
| 227 | + MOVL 0(BP)(SI*4), AX |
| 228 | + // multiply |
176 | 229 | MULL m+36(FP)
|
177 |
| - ADDL CX, AX |
178 |
| - ADCL $0, DX |
179 |
| - ADDL (DI)(BX*4), AX |
180 |
| - MOVL AX, (BP)(BX*4) |
181 |
| - ADCL $0, DX |
182 |
| - MOVL DX, CX |
183 |
| - ADDL $1, BX // i++ |
184 |
| - |
185 |
| -E6: CMPL BX, $0 // i < 0 |
186 |
| - JL L6 |
187 |
| - |
188 |
| - MOVL CX, c+44(FP) |
| 230 | + ADDL BX, AX |
| 231 | + MOVL DX, BX |
| 232 | + ADCL $0, BX |
| 233 | + // add |
| 234 | + ADDL 0(DI)(SI*4), AX |
| 235 | + ADCL $0, BX |
| 236 | + MOVL AX, 0(CX)(SI*4) |
| 237 | + ADDL $1, SI; JNZ loop1cont |
| 238 | +loop1done: |
| 239 | + MOVL BX, c+44(FP) |
189 | 240 | RET
|
190 |
| - |
191 |
| - |
192 |
| - |
|
0 commit comments