|
3 | 3 | A minimal GPU implementation in Verilog optimized for learning about how GPUs work from the ground up.
|
4 | 4 |
|
5 | 5 | - [Overview]()
|
6 |
| -- [Architecture]() |
7 |
| -- [ISA]() |
8 |
| -- [SIMD]() |
9 |
| -- [Memory]() |
| 6 | +- [Architecture](#architecture) |
| 7 | +- [ISA](#isa) |
| 8 | +- [SIMD](#simd) |
| 9 | +- [Memory](#memory) |
10 | 10 | - [Kernels](#kernels)
|
11 | 11 | - [Simulation]()
|
12 | 12 |
|
@@ -56,3 +56,79 @@ For each topic, we'll first cover how tiny-gpu implements the fundamentals. Then
|
56 | 56 | # SIMD
|
57 | 57 |
|
58 | 58 | 
|
| 59 | + |
| 60 | +# Memory |
| 61 | + |
| 62 | +# Kernels |
| 63 | + |
| 64 | +```asm filename="matadd.asm" |
| 65 | +.threads 8 |
| 66 | +.data 0 1 2 3 4 5 6 7 ; matrix A (1 x 8) |
| 67 | +.data 0 1 2 3 4 5 6 7 ; matrix B (1 x 8) |
| 68 | +
|
| 69 | +MUL R0, %blockIdx, %blockDim |
| 70 | +ADD R0, R0, %threadIdx ; i = blockIdx * blockDim + threadIdx |
| 71 | +
|
| 72 | +CONST R1, #0 ; baseA (matrix A base address) |
| 73 | +CONST R2, #8 ; baseB (matrix B base address) |
| 74 | +CONST R3, #16 ; baseC (matrix C base address) |
| 75 | +
|
| 76 | +ADD R4, R1, R0 ; addr(A[i]) = baseA + i |
| 77 | +LDR R4, R4 ; load A[i] from global memory |
| 78 | +
|
| 79 | +ADD R5, R2, R0 ; addr(B[i]) = baseB + i |
| 80 | +LDR R5, R5 ; load B[i] from global memory |
| 81 | +
|
| 82 | +ADD R6, R4, R5 ; C[i] = A[i] + B[i] |
| 83 | +
|
| 84 | +ADD R7, R3, R0 ; addr(C[i]) = baseC + i |
| 85 | +STR R7, R6 ; store C[i] in global memory |
| 86 | +
|
| 87 | +RET ; end of kernel |
| 88 | +``` |
| 89 | + |
| 90 | +```asm filename="matmul.asm" |
| 91 | +.threads 4 |
| 92 | +.data 1 2 3 4 ; matrix A (2 x 2) |
| 93 | +.data 1 2 3 4 ; matrix B (2 x 2) |
| 94 | +
|
| 95 | +MUL R0, %blockIdx, %blockDim |
| 96 | +ADD R0, R0, %threadIdx ; i = blockIdx * blockDim + threadIdx |
| 97 | +
|
| 98 | +CONST R1, #1 ; increment |
| 99 | +CONST R2, #2 ; N (matrix inner dimension) |
| 100 | +CONST R3, #0 ; baseA (matrix A base address) |
| 101 | +CONST R4, #4 ; baseB (matrix B base address) |
| 102 | +CONST R5, #8 ; baseC (matrix C base address) |
| 103 | +
|
| 104 | +DIV R6, R0, R2 ; row = i // N |
| 105 | +MUL R7, R6, R2 |
| 106 | +SUB R7, R0, R7 ; col = i % N |
| 107 | +
|
| 108 | +CONST R8, #0 ; acc = 0 |
| 109 | +CONST R9, #0 ; k = 0 |
| 110 | +
|
| 111 | +LOOP: |
| 112 | + MUL R10, R6, R2 |
| 113 | + ADD R10, R10, R9 |
| 114 | + ADD R10, R10, R3 ; addr(A[row][k]) = row * N + k + baseA |
| 115 | + LDR R10, R10 ; load A[row][k] from global memory |
| 116 | +
|
| 117 | + MUL R11, R9, R2 |
| 118 | + ADD R11, R11, R7 |
| 119 | + ADD R11, R11, R4 ; addr(B[k][col]) = k * N + col + baseB |
| 120 | + LDR R11, R11 ; load B[k][col] from global memory |
| 121 | +
|
| 122 | + MUL R12, R10, R11 |
| 123 | + ADD R8, R8, R12 ; acc = acc + A[row][k] * B[k][col] |
| 124 | +
|
| 125 | + ADD R9, R9, R1 ; increment k |
| 126 | +
|
| 127 | + CMP R9, R2 |
| 128 | + BRn LOOP ; loop while k < N |
| 129 | +
|
| 130 | +ADD R9, R5, R0 ; addr(C[i]) = baseC + i |
| 131 | +STR R9, R8 ; store C[i] in global memory |
| 132 | +
|
| 133 | +RET ; end of kernel |
| 134 | +``` |
0 commit comments