Skip to content

Commit 5efe9dc

Browse files
committed
Update thread execution diagram
1 parent 5d05a5e commit 5efe9dc

File tree

1 file changed

+80
-4
lines changed

1 file changed

+80
-4
lines changed

README.md

Lines changed: 80 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@
33
A minimal GPU implementation in Verilog optimized for learning about how GPUs work from the ground up.
44

55
- [Overview]()
6-
- [Architecture]()
7-
- [ISA]()
8-
- [SIMD]()
9-
- [Memory]()
6+
- [Architecture](#architecture)
7+
- [ISA](#isa)
8+
- [SIMD](#simd)
9+
- [Memory](#memory)
1010
- [Kernels](#kernels)
1111
- [Simulation]()
1212

@@ -56,3 +56,79 @@ For each topic, we'll first cover how tiny-gpu implements the fundamentals. Then
5656
# SIMD
5757

5858
![Thread](/docs/images/thread.png)
59+
60+
# Memory
61+
62+
# Kernels
63+
64+
```asm filename="matadd.asm"
65+
.threads 8
66+
.data 0 1 2 3 4 5 6 7 ; matrix A (1 x 8)
67+
.data 0 1 2 3 4 5 6 7 ; matrix B (1 x 8)
68+
69+
MUL R0, %blockIdx, %blockDim
70+
ADD R0, R0, %threadIdx ; i = blockIdx * blockDim + threadIdx
71+
72+
CONST R1, #0 ; baseA (matrix A base address)
73+
CONST R2, #8 ; baseB (matrix B base address)
74+
CONST R3, #16 ; baseC (matrix C base address)
75+
76+
ADD R4, R1, R0 ; addr(A[i]) = baseA + i
77+
LDR R4, R4 ; load A[i] from global memory
78+
79+
ADD R5, R2, R0 ; addr(B[i]) = baseB + i
80+
LDR R5, R5 ; load B[i] from global memory
81+
82+
ADD R6, R4, R5 ; C[i] = A[i] + B[i]
83+
84+
ADD R7, R3, R0 ; addr(C[i]) = baseC + i
85+
STR R7, R6 ; store C[i] in global memory
86+
87+
RET ; end of kernel
88+
```
89+
90+
```asm filename="matmul.asm"
91+
.threads 4
92+
.data 1 2 3 4 ; matrix A (2 x 2)
93+
.data 1 2 3 4 ; matrix B (2 x 2)
94+
95+
MUL R0, %blockIdx, %blockDim
96+
ADD R0, R0, %threadIdx ; i = blockIdx * blockDim + threadIdx
97+
98+
CONST R1, #1 ; increment
99+
CONST R2, #2 ; N (matrix inner dimension)
100+
CONST R3, #0 ; baseA (matrix A base address)
101+
CONST R4, #4 ; baseB (matrix B base address)
102+
CONST R5, #8 ; baseC (matrix C base address)
103+
104+
DIV R6, R0, R2 ; row = i // N
105+
MUL R7, R6, R2
106+
SUB R7, R0, R7 ; col = i % N
107+
108+
CONST R8, #0 ; acc = 0
109+
CONST R9, #0 ; k = 0
110+
111+
LOOP:
112+
MUL R10, R6, R2
113+
ADD R10, R10, R9
114+
ADD R10, R10, R3 ; addr(A[row][k]) = row * N + k + baseA
115+
LDR R10, R10 ; load A[row][k] from global memory
116+
117+
MUL R11, R9, R2
118+
ADD R11, R11, R7
119+
ADD R11, R11, R4 ; addr(B[k][col]) = k * N + col + baseB
120+
LDR R11, R11 ; load B[k][col] from global memory
121+
122+
MUL R12, R10, R11
123+
ADD R8, R8, R12 ; acc = acc + A[row][k] * B[k][col]
124+
125+
ADD R9, R9, R1 ; increment k
126+
127+
CMP R9, R2
128+
BRn LOOP ; loop while k < N
129+
130+
ADD R9, R5, R0 ; addr(C[i]) = baseC + i
131+
STR R9, R8 ; store C[i] in global memory
132+
133+
RET ; end of kernel
134+
```

0 commit comments

Comments
 (0)