|
101 | 101 | k = te.reduce_axis((0, K), "k") |
102 | 102 | A = te.placeholder((M, K), name="A") |
103 | 103 | B = te.placeholder((K, N), name="B") |
104 | | -C = te.compute((M, N), lambda x, y: te.sum(A[x, k] * B[k, y], axis=k), name="C") |
| 104 | +C = te.compute((M, N), lambda m, n: te.sum(A[m, k] * B[k, n], axis=k), name="C") |
105 | 105 |
|
106 | 106 | # Default schedule |
107 | 107 | s = te.create_schedule(C.op) |
|
130 | 130 | # fill 32 * 32 * sizeof(float), which is 4KB, in the cache whose total size is 32KB (L1 data cache)
131 | 131 |
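As a quick sanity check of the arithmetic above (a standalone sketch, independent of the schedule code below):

bn = 32
tile_bytes = bn * bn * 4      # 32 * 32 * sizeof(float32) = 4096 bytes
print(tile_bytes / 1024)      # 4.0 KB, comfortably inside a 32 KB L1 data cache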
|
132 | 132 | bn = 32 |
| 133 | +kfactor = 4 |
133 | 134 | s = te.create_schedule(C.op) |
134 | 135 |
|
135 | 136 | # Blocking by loop tiling |
136 | | -xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) |
137 | | -(k,) = s[C].op.reduce_axis |
138 | | -ko, ki = s[C].split(k, factor=4) |
| 137 | +mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) |
| 138 | +(kaxis,) = s[C].op.reduce_axis |
| 139 | +ko, ki = s[C].split(kaxis, factor=kfactor) |
139 | 140 |
|
140 | 141 | # Hoist reduction domain outside the blocking loop |
141 | | -s[C].reorder(xo, yo, ko, ki, xi, yi) |
| 142 | +s[C].reorder(mo, no, ko, ki, mi, ni) |
142 | 143 |
|
143 | 144 | func = tvm.build(s, [A, B, C], target=target, name="mmult") |
144 | 145 | assert func |
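To see the effect of blocking, one can print the lowered IR and time the built kernel. A minimal sketch, assuming the `dev`, `a`, `b`, and `c` buffers created in the tutorial's setup section:

# Inspect the tiled loop nest produced by the schedule above.
print(tvm.lower(s, [A, B, C], simple_mode=True))

# Time the blocked kernel; dev, a, b and c come from the earlier setup.
evaluator = func.time_evaluator(func.entry_name, dev, number=10)
print("blocking: %f" % evaluator(a, b, c).mean)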
|
162 | 163 | # ------------- |
163 | 164 | # Another important trick is vectorization. When the memory access pattern is uniform, |
164 | 165 | # the compiler can detect this pattern and pass the continuous memory to the vector processor. In TVM,
165 | | -# we can use `vectorize` interface to hint the compiler this pattern, so that we can accelerate it vastly. |
| 166 | +# we can use the `vectorize` interface to hint the compiler about this pattern, so that we can
| 167 | +# greatly accelerate it.
166 | 168 | # |
167 | 169 | # In this tutorial, we chose to vectorize the inner loop row data since it is cache friendly. |
168 | 170 |
|
169 | 171 | s = te.create_schedule(C.op) |
170 | | -xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) |
171 | | -(k,) = s[C].op.reduce_axis |
172 | | -ko, ki = s[C].split(k, factor=4) |
| 172 | +mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) |
| 173 | +(kaxis,) = s[C].op.reduce_axis |
| 174 | +ko, ki = s[C].split(kaxis, factor=kfactor) |
173 | 175 |
|
174 | | -s[C].reorder(xo, yo, ko, ki, xi, yi) |
| 176 | +s[C].reorder(mo, no, ko, ki, mi, ni) |
175 | 177 |
|
176 | 178 | # Vectorization |
177 | | -s[C].vectorize(yi) |
| 179 | +s[C].vectorize(ni) |
178 | 180 |
|
179 | 181 | func = tvm.build(s, [A, B, C], target=target, name="mmult") |
180 | 182 | assert func |
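It is also worth re-checking numerical correctness after each scheduling step. A small sketch, assuming `a`, `b`, the NumPy reference `answer`, `dev`, and `dtype` from the setup section:

import numpy as np
import tvm.testing

c = tvm.nd.array(np.zeros((M, N), dtype=dtype), dev)
func(a, b, c)
# Compare against the NumPy reference result computed in the setup section.
tvm.testing.assert_allclose(c.numpy(), answer, rtol=1e-5)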
|
194 | 196 | ################################################################################################### |
195 | 197 | # Loop Permutation |
196 | 198 | # ---------------- |
197 | | -# If we look at the above IR, we can see the inner loop row data is vectorized and |
198 | | -# B is transformed into PackedB. The traversal of PackedB is sequential now. |
199 | | -# So we will look at the access pattern of A. In current schedule, A is accessed column by column |
200 | | -# which is not cache friendly. If we change the nested loop order of ki and inner axes xi, |
| 199 | +# If we look at the above IR, we can see the inner loop row data is vectorized for both B and C. |
| 200 | +# Next we will look at the access pattern of A. In the current schedule, A is accessed column by column,
| 201 | +# which is not cache friendly. If we change the nested loop order of ki and the inner axis mi,
201 | 202 | # the access pattern for the A matrix is more cache friendly.
202 | 203 |
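The cache-friendliness argument comes down to strides in the flattened, row-major buffer. A tiny NumPy illustration (the array `A_np` is hypothetical, used only to show the strides):

import numpy as np

A_np = np.zeros((3, 4), dtype="float32")  # a toy row-major array standing in for A
print(A_np.strides)  # (16, 4): stepping along k moves 4 bytes; stepping along m jumps a whole row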
|
203 | 204 | s = te.create_schedule(C.op) |
204 | | -xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) |
205 | | -(k,) = s[C].op.reduce_axis |
206 | | -ko, ki = s[C].split(k, factor=4) |
| 205 | +mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) |
| 206 | +(kaxis,) = s[C].op.reduce_axis |
| 207 | +ko, ki = s[C].split(kaxis, factor=kfactor) |
207 | 208 |
|
208 | 209 | # re-ordering |
209 | | -s[C].reorder(xo, yo, ko, xi, ki, yi) |
210 | | -s[C].vectorize(yi) |
| 210 | +s[C].reorder(mo, no, ko, mi, ki, ni) |
| 211 | +s[C].vectorize(ni) |
211 | 212 |
|
212 | 213 | func = tvm.build(s, [A, B, C], target=target, name="mmult") |
213 | 214 | assert func |
|
227 | 228 | ################################################################################################### |
228 | 229 | # Array Packing |
229 | 230 | # ------------- |
230 | | -# Another important trick is array packing. This trick is to reorder the storage dimension of the |
231 | | -# array to convert the continuous access pattern on certain dimension to a sequential pattern after |
232 | | -# flattening. |
| 231 | +# Another important trick is array packing. The trick is to reorder the storage of a multi- |
| 232 | +# dimensional array so that it is accessed sequentially after it is flattened and stored in one- |
| 233 | +# dimensional memory. |
233 | 234 | # |
234 | 235 | # .. image:: https://github.com/dmlc/web-data/raw/main/tvm/tutorial/array-packing.png |
235 | 236 | # :align: center |
236 | 237 | # |
| 238 | +# NOTE: This figure is a general illustration of how array packing works. |
237 | 239 |
|
238 | 240 |
|
239 | 241 | ################################################################################################### |
240 | | -# Just as it is shown in the figure above, after blocking the computations, we can observe the array |
241 | | -# access pattern of B (after flattening), which is regular but discontinuous. We expect that after |
242 | | -# some transformation we can get continuous access pattern. We can reorder a [16][16] array to |
243 | | -# a [16/4][16][4] array, so that the access pattern of B will be sequential when grabing |
244 | | -# the corresponding value from the packed array. |
245 | | -# |
| 242 | +# We can use array packing to address the access pattern for B. Observe the array access pattern of |
| 243 | +# B after flattening, which is not sequential as we iterate over the K dimension. We can reorder B
| 244 | +# with dimensions [K][N] so that it has dimensions [N/bn][K][bn] where bn is the blocking factor and |
| 245 | +# also the vector size for B in the inner loop. This reorder splits N into two dimensions --- |
| 246 | +# bigN (N/bn) and littleN (bn) --- and the new dimensions [N/bn][K][bn] match the indexing of B |
| 247 | +# from outer to inner loops (no, ko, ki, ni), resulting in a sequential access pattern for B after
| 248 | +# flattening. |
| 249 | + |
246 | 250 |
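As a rough NumPy illustration of this layout change (toy sizes, purely hypothetical, not part of the schedule): element B[k, n] lands at packed[n // bn, k, n % bn].

import numpy as np

K_, N_, bn_ = 8, 8, 4                                          # toy sizes for illustration only
B_np = np.random.rand(K_, N_).astype("float32")
packed = B_np.reshape(K_, N_ // bn_, bn_).transpose(1, 0, 2)   # shape [N/bn, K, bn]
k_, n_ = 5, 6
assert packed[n_ // bn_, k_, n_ % bn_] == B_np[k_, n_]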
|
247 | 251 | # We have to re-write the algorithm slightly. |
248 | | -packedB = te.compute((N / bn, K, bn), lambda x, y, z: B[y, x * bn + z], name="packedB") |
| 252 | +packedB = te.compute( |
| 253 | + (N / bn, K, bn), lambda bigN, k, littleN: B[k, bigN * bn + littleN], name="packedB" |
| 254 | +) |
249 | 255 | C = te.compute( |
250 | 256 | (M, N), |
251 | | - lambda x, y: te.sum(A[x, k] * packedB[y // bn, k, tvm.tir.indexmod(y, bn)], axis=k), |
| 257 | + lambda m, n: te.sum(A[m, k] * packedB[n // bn, k, tvm.tir.indexmod(n, bn)], axis=k), |
252 | 258 | name="C", |
253 | 259 | ) |
254 | 260 |
|
255 | 261 | s = te.create_schedule(C.op) |
256 | 262 |
|
257 | | -xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) |
258 | | -(k,) = s[C].op.reduce_axis |
259 | | -ko, ki = s[C].split(k, factor=4) |
| 263 | +mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) |
| 264 | +(kaxis,) = s[C].op.reduce_axis |
| 265 | +ko, ki = s[C].split(kaxis, factor=kfactor) |
260 | 266 |
|
261 | | -s[C].reorder(xo, yo, ko, xi, ki, yi) |
262 | | -s[C].vectorize(yi) |
| 267 | +s[C].reorder(mo, no, ko, mi, ki, ni) |
| 268 | +s[C].vectorize(ni) |
263 | 269 |
|
264 | | -x, y, z = s[packedB].op.axis |
265 | | -s[packedB].vectorize(z) |
266 | | -s[packedB].parallel(x) |
| 270 | +bigN, _, littleN = s[packedB].op.axis |
| 271 | +s[packedB].vectorize(littleN) |
| 272 | +s[packedB].parallel(bigN) |
267 | 273 |
|
268 | 274 | func = tvm.build(s, [A, B, C], target=target, name="mmult") |
269 | 275 | assert func |
|
293 | 299 | # Allocate write cache |
294 | 300 | CC = s.cache_write(C, "global") |
295 | 301 |
|
296 | | -xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) |
| 302 | +mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) |
297 | 303 |
|
298 | | -# Write cache is computed at yo |
299 | | -s[CC].compute_at(s[C], yo) |
| 304 | +# Write cache is computed at no |
| 305 | +s[CC].compute_at(s[C], no) |
300 | 306 |
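Conceptually, `cache_write` plus `compute_at` give each (mo, no) tile a small local accumulation buffer that is written back to C once per tile. A hypothetical NumPy sketch of that pattern (the helper `tile_with_write_cache` and its arguments are made up for illustration, not the generated code):

import numpy as np

def tile_with_write_cache(A_np, packedB_np, C_np, mo, no, bn=32):
    # Accumulate one bn x bn tile in a local write cache, then write it back once.
    K_ = A_np.shape[1]
    CC_local = np.zeros((bn, bn), dtype=A_np.dtype)   # the buffer cache_write provides
    for k in range(K_):                               # rank-1 update per k
        CC_local += np.outer(A_np[mo * bn : (mo + 1) * bn, k], packedB_np[no, k, :])
    C_np[mo * bn : (mo + 1) * bn, no * bn : (no + 1) * bn] = CC_local  # single write-back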
|
301 | 307 | # New inner axes |
302 | | -xc, yc = s[CC].op.axis |
| 308 | +mc, nc = s[CC].op.axis |
| 309 | + |
| 310 | +(kaxis,) = s[CC].op.reduce_axis |
| 311 | +ko, ki = s[CC].split(kaxis, factor=kfactor) |
| 312 | +s[CC].reorder(ko, mc, ki, nc) |
| 313 | +s[CC].vectorize(nc) |
303 | 314 |
|
304 | | -(k,) = s[CC].op.reduce_axis |
305 | | -ko, ki = s[CC].split(k, factor=4) |
306 | | -s[CC].reorder(ko, xc, ki, yc) |
| 315 | +# TODO: Add separate optimization step to discuss loop unrolling
| 316 | +# Unrolling is a loop optimization strategy which can reduce branch
| 317 | +# prediction failures and increase the chance of concurrent execution.
| 318 | +# unroll kfactor loops |
307 | 319 | s[CC].unroll(ki) |
308 | | -s[CC].vectorize(yc) |
309 | 320 |
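Conceptually, unrolling `ki` by `kfactor` replaces the innermost reduction loop with straight-line code. A plain-Python sketch of the idea (the helper `dot_unrolled` is hypothetical, not the generated code):

def dot_unrolled(a_row, b_col, ko, kfactor=4):
    # The four statements below stand in for "for ki in range(kfactor): acc += ...".
    base = ko * kfactor
    return (a_row[base + 0] * b_col[base + 0]
            + a_row[base + 1] * b_col[base + 1]
            + a_row[base + 2] * b_col[base + 2]
            + a_row[base + 3] * b_col[base + 3])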
|
310 | | -x, y, z = s[packedB].op.axis |
311 | | -s[packedB].vectorize(z) |
312 | | -s[packedB].parallel(x) |
| 321 | +bigN, _, littleN = s[packedB].op.axis |
| 322 | +s[packedB].vectorize(littleN) |
| 323 | +s[packedB].parallel(bigN) |
313 | 324 |
|
314 | 325 | func = tvm.build(s, [A, B, C], target=target, name="mmult") |
315 | 326 | assert func |
|
335 | 346 |
|
336 | 347 | CC = s.cache_write(C, "global") |
337 | 348 |
|
338 | | -xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) |
| 349 | +mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) |
339 | 350 |
|
340 | | -s[CC].compute_at(s[C], yo) |
| 351 | +s[CC].compute_at(s[C], no) |
341 | 352 |
|
342 | | -xc, yc = s[CC].op.axis |
| 353 | +mc, nc = s[CC].op.axis |
343 | 354 |
|
344 | | -(k,) = s[CC].op.reduce_axis |
345 | | -ko, ki = s[CC].split(k, factor=4) |
346 | | -s[CC].reorder(ko, xc, ki, yc) |
| 355 | +(kaxis,) = s[CC].op.reduce_axis |
| 356 | +ko, ki = s[CC].split(kaxis, factor=kfactor) |
| 357 | +s[CC].reorder(ko, mc, ki, nc) |
| 358 | +s[CC].vectorize(nc) |
347 | 359 | s[CC].unroll(ki) |
348 | | -s[CC].vectorize(yc) |
349 | 360 |
|
350 | 361 | # parallel |
351 | | -s[C].parallel(xo) |
| 362 | +s[C].parallel(mo) |
352 | 363 |
|
353 | | -x, y, z = s[packedB].op.axis |
354 | | -s[packedB].vectorize(z) |
355 | | -s[packedB].parallel(x) |
| 364 | +bigN, _, littleN = s[packedB].op.axis |
| 365 | +s[packedB].vectorize(littleN) |
| 366 | +s[packedB].parallel(bigN) |
356 | 367 |
|
357 | 368 | func = tvm.build(s, [A, B, C], target=target, name="mmult") |
358 | 369 | assert func |
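When experimenting with `parallel`, the number of worker threads can be controlled through the `TVM_NUM_THREADS` environment variable, which the runtime thread pool reads at startup. A minimal sketch (the value 4 is just an example):

import os

# Set before the first parallel kernel runs so the thread pool picks it up.
os.environ["TVM_NUM_THREADS"] = "4"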
|