Skip to content

Commit af8530c

Browse files
committed
Update documentation and remove bad solutions
1 parent 7f33ea6 commit af8530c

File tree

13 files changed

+155
-89
lines changed

13 files changed

+155
-89
lines changed

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,3 +242,8 @@ tiny-gpu is setup to simulate the execution of both of the above kernels using `
242242
Running `make test_matadd` or `make test_matmul` will run the specified kernel and output a log file with the complete execution trace of the kernel from start to finish, as well as the initial and final states of data memory.
243243

244244
The `matadd` kernel adds 2 1x8 matrices across 8 threads running on 2 cores, and the `matmul` kernel multiplies 2 2x2 matrices across 4 threads.
245+
246+
# Notes
247+
248+
- Many things that could be wires are registers to make things explicitly synchronous and for code simplicity and clarity.
249+
- To make control flow explicit, state management spreads some operations across multiple cycles that could be done in one.

src/alu.sv

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
module alu (
1010
input wire clk,
1111
input wire reset,
12-
input wire enable,
12+
input wire enable, // If the current block has fewer threads than the block size, some ALUs will be inactive
1313

1414
input reg [2:0] core_state,
1515

@@ -35,9 +35,10 @@ module alu (
3535
// Calculate alu_out when core_state = EXECUTE
3636
if (core_state == 3'b101) begin
3737
if (decoded_alu_output_mux == 1) begin
38-
// Set values to compare with NZP register
38+
// Set values to compare with NZP register in alu_out[2:0]
3939
alu_out_reg <= {5'b0, (rs - rt > 0), (rs - rt == 0), (rs - rt < 0)};
4040
end else begin
41+
// Execute the specified arithmetic instruction
4142
case (decoded_alu_arithmetic_mux)
4243
ADD: begin
4344
alu_out_reg <= rs + rt;

src/controller.sv

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,30 +8,28 @@
88
module controller #(
99
parameter ADDR_BITS = 8,
1010
parameter DATA_BITS = 16,
11-
parameter NUM_CONSUMERS = 4,
12-
parameter NUM_CHANNELS = 1,
13-
parameter WRITE_ENABLE = 1
11+
parameter NUM_CONSUMERS = 4, // The number of consumers accessing memory through this controller
12+
parameter NUM_CHANNELS = 1, // The number of concurrent channels available to send requests to global memory
13+
parameter WRITE_ENABLE = 1 // Whether this memory controller can write to memory (program memory is read-only)
1414
) (
1515
input wire clk,
1616
input wire reset,
1717

18-
// LSU Interface
18+
// Consumer Interface (Fetchers / LSUs)
1919
input reg [NUM_CONSUMERS-1:0] consumer_read_valid,
2020
input reg [ADDR_BITS-1:0] consumer_read_address [NUM_CONSUMERS-1:0],
2121
output reg [NUM_CONSUMERS-1:0] consumer_read_ready,
2222
output reg [DATA_BITS-1:0] consumer_read_data [NUM_CONSUMERS-1:0],
23-
2423
input reg [NUM_CONSUMERS-1:0] consumer_write_valid,
2524
input reg [ADDR_BITS-1:0] consumer_write_address [NUM_CONSUMERS-1:0],
2625
input reg [DATA_BITS-1:0] consumer_write_data [NUM_CONSUMERS-1:0],
2726
output reg [NUM_CONSUMERS-1:0] consumer_write_ready,
2827

29-
// Memory Interface
28+
// Memory Interface (Data / Program)
3029
output reg [NUM_CHANNELS-1:0] mem_read_valid,
3130
output reg [ADDR_BITS-1:0] mem_read_address [NUM_CHANNELS-1:0],
3231
input reg [NUM_CHANNELS-1:0] mem_read_ready,
3332
input reg [DATA_BITS-1:0] mem_read_data [NUM_CHANNELS-1:0],
34-
3533
output reg [NUM_CHANNELS-1:0] mem_write_valid,
3634
output reg [ADDR_BITS-1:0] mem_write_address [NUM_CHANNELS-1:0],
3735
output reg [DATA_BITS-1:0] mem_write_data [NUM_CHANNELS-1:0],
@@ -43,11 +41,11 @@ module controller #(
4341
READ_RELAYING = 3'b100,
4442
WRITE_RELAYING = 3'b101;
4543

44+
// Keep track of state for each channel and which jobs each channel is handling
4645
reg [2:0] controller_state [NUM_CHANNELS-1:0];
47-
reg [$clog2(NUM_CONSUMERS)-1:0] current_consumer [NUM_CHANNELS-1:0];
48-
reg [NUM_CONSUMERS-1:0] channel_serving_consumer;
46+
reg [$clog2(NUM_CONSUMERS)-1:0] current_consumer [NUM_CHANNELS-1:0]; // Which consumer is each channel currently serving
47+
reg [NUM_CONSUMERS-1:0] channel_serving_consumer; // Which consumers are currently being served by a channel? Prevents multiple channels from picking up the same request.
4948

50-
// TODO: read/write should be separate channels, and should handle multi-channel
5149
always @(posedge clk) begin
5250
if (reset) begin
5351
mem_read_valid <= 0;
@@ -63,11 +61,14 @@ module controller #(
6361

6462
current_consumer <= 0;
6563
controller_state <= 0;
66-
channel_serving_consumer <= 0;
64+
65+
channel_serving_consumer = 0;
6766
end else begin
67+
// For each channel, we handle processing concurrently
6868
for (int i = 0; i < NUM_CHANNELS; i = i + 1) begin
6969
case (controller_state[i])
7070
IDLE: begin
71+
// While this channel is idle, cycle through consumers looking for one with a pending request
7172
for (int j = 0; j < NUM_CONSUMERS; j = j + 1) begin
7273
if (consumer_read_valid[j] && !channel_serving_consumer[j]) begin
7374
channel_serving_consumer[j] = 1;
@@ -77,6 +78,7 @@ module controller #(
7778
mem_read_address[i] <= consumer_read_address[j];
7879
controller_state[i] <= READ_WAITING;
7980

81+
// Once we find a pending request, pick it up with this channel and stop looking for requests
8082
break;
8183
end else if (consumer_write_valid[j] && !channel_serving_consumer[j]) begin
8284
channel_serving_consumer[j] = 1;
@@ -87,6 +89,7 @@ module controller #(
8789
mem_write_data[i] <= consumer_write_data[j];
8890
controller_state[i] <= WRITE_WAITING;
8991

92+
// Once we find a pending request, pick it up with this channel and stop looking for requests
9093
break;
9194
end
9295
end
@@ -108,17 +111,17 @@ module controller #(
108111
controller_state[i] <= WRITE_RELAYING;
109112
end
110113
end
111-
// Wait until consumer acknowledges it received data, then reset
114+
// Wait until consumer acknowledges it received response, then reset
112115
READ_RELAYING: begin
113116
if (!consumer_read_valid[current_consumer[i]]) begin
114-
channel_serving_consumer[current_consumer[i]] <= 0;
117+
channel_serving_consumer[current_consumer[i]] = 0;
115118
consumer_read_ready[current_consumer[i]] <= 0;
116119
controller_state[i] <= IDLE;
117120
end
118121
end
119122
WRITE_RELAYING: begin
120123
if (!consumer_write_valid[current_consumer[i]]) begin
121-
channel_serving_consumer[current_consumer[i]] <= 0;
124+
channel_serving_consumer[current_consumer[i]] = 0;
122125
consumer_write_ready[current_consumer[i]] <= 0;
123126
controller_state[i] <= IDLE;
124127
end

src/core.sv

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,36 +14,37 @@ module core #(
1414
) (
1515
input wire clk,
1616
input wire reset,
17+
18+
// Kernel Execution
1719
input wire start,
1820
output wire done,
1921

20-
// METADATA
22+
// Block Metadata
2123
input wire [7:0] block_id,
2224
input wire [$clog2(THREADS_PER_BLOCK):0] thread_count,
2325

24-
// PROGRAM MEMORY
26+
// Program Memory
2527
output reg program_mem_read_valid,
2628
output reg [PROGRAM_MEM_ADDR_BITS-1:0] program_mem_read_address,
2729
input reg program_mem_read_ready,
2830
input reg [PROGRAM_MEM_DATA_BITS-1:0] program_mem_read_data,
2931

30-
// DATA MEMORY
32+
// Data Memory
3133
output reg [THREADS_PER_BLOCK-1:0] data_mem_read_valid,
3234
output reg [DATA_MEM_ADDR_BITS-1:0] data_mem_read_address [THREADS_PER_BLOCK-1:0],
3335
input reg [THREADS_PER_BLOCK-1:0] data_mem_read_ready,
3436
input reg [DATA_MEM_DATA_BITS-1:0] data_mem_read_data [THREADS_PER_BLOCK-1:0],
35-
3637
output reg [THREADS_PER_BLOCK-1:0] data_mem_write_valid,
3738
output reg [DATA_MEM_ADDR_BITS-1:0] data_mem_write_address [THREADS_PER_BLOCK-1:0],
3839
output reg [DATA_MEM_DATA_BITS-1:0] data_mem_write_data [THREADS_PER_BLOCK-1:0],
3940
input reg [THREADS_PER_BLOCK-1:0] data_mem_write_ready
4041
);
41-
// STATE
42+
// State
4243
reg [2:0] core_state;
4344
reg [2:0] fetcher_state;
4445
reg [15:0] instruction;
4546

46-
// EXECUTION
47+
// Intermediate Signals
4748
reg [7:0] current_pc;
4849
wire [7:0] next_pc[THREADS_PER_BLOCK-1:0];
4950
reg [7:0] rs[THREADS_PER_BLOCK-1:0];
@@ -52,12 +53,14 @@ module core #(
5253
reg [7:0] lsu_out[THREADS_PER_BLOCK-1:0];
5354
wire [7:0] alu_out[THREADS_PER_BLOCK-1:0];
5455

55-
// DECODER
56+
// Decoded Instruction Signals
5657
reg [3:0] decoded_rd_address;
5758
reg [3:0] decoded_rs_address;
5859
reg [3:0] decoded_rt_address;
5960
reg [2:0] decoded_nzp;
6061
reg [7:0] decoded_immediate;
62+
63+
// Decoded Control Signals
6164
reg decoded_reg_write_enable; // Enable writing to a register
6265
reg decoded_mem_read_enable; // Enable reading from memory
6366
reg decoded_mem_write_enable; // Enable writing to memory
@@ -68,6 +71,7 @@ module core #(
6871
reg decoded_pc_mux; // Select source of next PC
6972
reg decoded_ret;
7073

74+
// Fetcher
7175
fetcher #(
7276
.PROGRAM_MEM_ADDR_BITS(PROGRAM_MEM_ADDR_BITS),
7377
.PROGRAM_MEM_DATA_BITS(PROGRAM_MEM_DATA_BITS)
@@ -84,6 +88,7 @@ module core #(
8488
.instruction(instruction)
8589
);
8690

91+
// Decoder
8792
decoder decoder_instance (
8893
.clk(clk),
8994
.reset(reset),
@@ -105,6 +110,7 @@ module core #(
105110
.decoded_ret(decoded_ret)
106111
);
107112

113+
// Scheduler
108114
scheduler #(
109115
.THREADS_PER_BLOCK(THREADS_PER_BLOCK),
110116
) scheduler_instance (
@@ -122,9 +128,11 @@ module core #(
122128
.done(done)
123129
);
124130

131+
// Dedicated ALU, LSU, registers, & PC unit for each thread this core has capacity for
125132
genvar i;
126133
generate
127134
for (i = 0; i < THREADS_PER_BLOCK; i = i + 1) begin : threads
135+
// ALU
128136
alu alu_instance (
129137
.clk(clk),
130138
.reset(reset),
@@ -137,6 +145,7 @@ module core #(
137145
.alu_out(alu_out[i])
138146
);
139147

148+
// LSU
140149
lsu lsu_instance (
141150
.clk(clk),
142151
.reset(reset),
@@ -158,6 +167,7 @@ module core #(
158167
.lsu_out(lsu_out[i])
159168
);
160169

170+
// Register File
161171
registers #(
162172
.THREADS_PER_BLOCK(THREADS_PER_BLOCK),
163173
.THREAD_ID(i),
@@ -180,6 +190,7 @@ module core #(
180190
.rt(rt[i])
181191
);
182192

193+
// Program Counter
183194
pc #(
184195
.DATA_MEM_DATA_BITS(DATA_MEM_DATA_BITS),
185196
.PROGRAM_MEM_ADDR_BITS(PROGRAM_MEM_ADDR_BITS)

src/dcr.sv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ module dcr (
1212
input wire [7:0] device_control_data,
1313
output wire [7:0] thread_count,
1414
);
15+
// Store device control data in dedicated register
1516
reg [7:0] device_conrol_register;
1617
assign thread_count = device_conrol_register[7:0];
1718

src/decoder.sv

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,14 @@ module decoder (
1111
input reg [2:0] core_state,
1212
input reg [15:0] instruction,
1313

14-
// Values
14+
// Instruction Signals
1515
output reg [3:0] decoded_rd_address,
1616
output reg [3:0] decoded_rs_address,
1717
output reg [3:0] decoded_rt_address,
1818
output reg [2:0] decoded_nzp,
1919
output reg [7:0] decoded_immediate,
2020

21-
// Signals
21+
// Control Signals
2222
output reg decoded_reg_write_enable, // Enable writing to a register
2323
output reg decoded_mem_read_enable, // Enable reading from memory
2424
output reg decoded_mem_write_enable, // Enable writing to memory
@@ -28,7 +28,7 @@ module decoder (
2828
output reg decoded_alu_output_mux, // Select operation in ALU
2929
output reg decoded_pc_mux, // Select source of next PC
3030

31-
// Done
31+
// Return (finished executing thread)
3232
output reg decoded_ret
3333
);
3434
localparam NOP = 4'b0000,
@@ -62,12 +62,14 @@ module decoder (
6262
end else begin
6363
// Decode when core_state = DECODE
6464
if (core_state == 3'b010) begin
65+
// Get instruction signals from instruction every time
6566
decoded_rd_address <= instruction[11:8];
6667
decoded_rs_address <= instruction[7:4];
6768
decoded_rt_address <= instruction[3:0];
6869
decoded_immediate <= instruction[7:0];
6970
decoded_nzp <= instruction[11:9];
7071

72+
// Control signals reset on every decode and set conditionally by instruction
7173
decoded_reg_write_enable <= 0;
7274
decoded_mem_read_enable <= 0;
7375
decoded_mem_write_enable <= 0;
@@ -78,6 +80,7 @@ module decoder (
7880
decoded_pc_mux <= 0;
7981
decoded_ret <= 0;
8082

83+
// Set the control signals for each instruction
8184
case (instruction[15:12])
8285
NOP: begin
8386
// no-op

src/dispatch.sv

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,26 +13,33 @@ module dispatch #(
1313
input wire reset,
1414
input wire start,
1515

16+
// Kernel Metadata
1617
input wire [7:0] thread_count,
18+
19+
// Core States
1720
input reg [NUM_CORES-1:0] core_done,
1821
output reg [NUM_CORES-1:0] core_start,
1922
output reg [NUM_CORES-1:0] core_reset,
2023
output reg [7:0] core_block_id [NUM_CORES-1:0],
2124
output reg [$clog2(THREADS_PER_BLOCK):0] core_thread_count [NUM_CORES-1:0],
25+
26+
// Kernel Execution
2227
output reg done
2328
);
29+
// Calculate the total number of blocks based on total threads & threads per block
2430
wire [7:0] total_blocks;
2531
assign total_blocks = (thread_count + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
2632

27-
reg [7:0] blocks_dispatched;
28-
reg [7:0] blocks_done;
29-
reg start_execution;
33+
// Keep track of how many blocks have been processed
34+
reg [7:0] blocks_dispatched; // How many blocks have been sent to cores?
35+
reg [7:0] blocks_done; // How many blocks have finished processing?
36+
reg start_execution; // EDA: Unimportant hack used because of EDA tooling
3037

3138
always @(posedge clk) begin
3239
if (reset) begin
3340
done <= 0;
34-
blocks_dispatched <= 0;
35-
blocks_done <= 0;
41+
blocks_dispatched = 0;
42+
blocks_done = 0;
3643
start_execution <= 0;
3744

3845
for (int i = 0; i < NUM_CORES; i++) begin
@@ -42,15 +49,15 @@ module dispatch #(
4249
core_thread_count[i] <= THREADS_PER_BLOCK;
4350
end
4451
end else if (start) begin
45-
// Indirect way to get posedge start without driving from 2 cycles
46-
// TODO: Remove this ugly code
52+
// EDA: Indirect way to get @(posedge start) without driving from 2 different clocks
4753
if (!start_execution) begin
4854
start_execution <= 1;
4955
for (int i = 0; i < NUM_CORES; i++) begin
5056
core_reset[i] <= 1;
5157
end
5258
end
5359

60+
// If the last block has finished processing, mark this kernel as done executing
5461
if (blocks_done == total_blocks) begin
5562
done <= 1;
5663
end
@@ -67,22 +74,17 @@ module dispatch #(
6774
? thread_count - (blocks_dispatched * THREADS_PER_BLOCK)
6875
: THREADS_PER_BLOCK;
6976

70-
blocks_dispatched <= blocks_dispatched + 1;
71-
72-
// Only dispatch one block per clock cycle, so multiple cores can't pickup the same block
73-
break;
77+
blocks_dispatched = blocks_dispatched + 1;
7478
end
7579
end
7680
end
7781

7882
for (int i = 0; i < NUM_CORES; i++) begin
7983
if (core_start[i] && core_done[i]) begin
84+
// If a core just finished executing its current block, reset it
8085
core_reset[i] <= 1;
8186
core_start[i] <= 0;
82-
blocks_done <= blocks_done + 1;
83-
84-
// Only update one block per cycle
85-
break;
87+
blocks_done = blocks_done + 1;
8688
end
8789
end
8890
end

0 commit comments

Comments
 (0)