Skip to content

Commit f9b6e74

Browse files
committed
[VTA] Design refactoring and bug fixes (apache#20)
* updating driver and dll paths for latest Pynq v2.3 image * unifying tcl scripting for multiple FPGA backends, making bus width parameterizable * fix coherent interface on Ultra96 * 2d padded load template * simplifying tensor load/store * streamlining GEMM and ALU pipeline * refactor VTA for simpler instruction decoding * fixed bug, mixed DSP/LUT GEMM support * adding latency directive in DSPs for better pipelining and timing closure * dual channel memory interface for Ultra96 * hardware bug fixes, working compilation on PYNQ and ULTRA96 * checking for correctness * support for Pynq v2.3 * report inference time in ms * update CMake for Pynq v2.3 * testing hardware support for batch norm * adding support for multiplication in ALU * defaulting to coherent buffers in runtime * increasing size of CMA buffer allocation limit
1 parent daffce5 commit f9b6e74

File tree

23 files changed

+2388
-3822
lines changed

23 files changed

+2388
-3822
lines changed

cmake/modules/VTA.cmake

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -38,16 +38,9 @@ elseif(PYTHON)
3838
set_target_properties(vta PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
3939
endif(APPLE)
4040

41-
# PYNQ rules for Pynq v2.3
42-
if(${VTA_TARGET} STREQUAL "pynq")
43-
find_library(__cma_lib NAMES cma PATH /usr/lib)
44-
target_link_libraries(vta ${__cma_lib})
45-
endif()
46-
# Ultra96 rules
47-
if(${VTA_TARGET} STREQUAL "ultra96")
48-
find_library(__sds_lib NAMES sds_lib PATH /usr/lib)
49-
target_link_libraries(vta ${__sds_lib})
50-
endif()
41+
# PYNQ rules for pynq v2.3
42+
find_library(__cma_lib NAMES cma PATH /usr/lib)
43+
target_link_libraries(vta ${__cma_lib})
5144
else()
5245
message(STATUS "Cannot found python in env, VTA build is skipped..")
5346
endif()

vta/config/pynq_sample.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
"LOG_BATCH" : 0,
1515
"LOG_BLOCK_IN" : 4,
1616
"LOG_BLOCK_OUT" : 4,
17+
"LOG_BUS_WIDTH" : 6,
1718
"LOG_UOP_BUFF_SIZE" : 15,
1819
"LOG_INP_BUFF_SIZE" : 15,
1920
"LOG_WGT_BUFF_SIZE" : 18,

vta/config/ultra96_sample.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
"LOG_BATCH" : 0,
1515
"LOG_BLOCK_IN" : 4,
1616
"LOG_BLOCK_OUT" : 4,
17+
"LOG_BUS_WIDTH" : 7,
1718
"LOG_UOP_BUFF_SIZE" : 15,
1819
"LOG_INP_BUFF_SIZE" : 15,
1920
"LOG_WGT_BUFF_SIZE" : 18,

vta/config/vta_config.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ def main():
6060
help="returns log of tensor block in dimension")
6161
parser.add_argument("--get-blockout", action="store_true",
6262
help="returns log of tensor block out dimension")
63+
parser.add_argument("--get-buswidth", action="store_true",
64+
help="returns log of bus width in b")
6365
parser.add_argument("--get-uopbuffsize", action="store_true",
6466
help="returns log of micro-op buffer size in B")
6567
parser.add_argument("--get-inpbuffsize", action="store_true",
@@ -182,6 +184,9 @@ def main():
182184
if args.get_blockout:
183185
print(cfg["LOG_BLOCK_OUT"])
184186

187+
if args.get_buswidth:
188+
print(cfg["LOG_BUS_WIDTH"])
189+
185190
if args.get_uopbuffsize:
186191
print(cfg["LOG_UOP_BUFF_SIZE"])
187192

vta/hardware/xilinx/Makefile

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ VIVADO = vivado
1414
HSI = hsi
1515

1616
# HLS mode
17-
MODE = skip_sim
17+
MODE = all
1818
# Debug flag
1919
DEBUG = False
2020
# SLURM
@@ -35,6 +35,7 @@ VTA_OUT_WIDTH := $(shell ${VTA_CONFIG} --get-outwidth)
3535
VTA_BATCH := $(shell ${VTA_CONFIG} --get-batch)
3636
VTA_IN_BLOCK := $(shell ${VTA_CONFIG} --get-blockin)
3737
VTA_OUT_BLOCK := $(shell ${VTA_CONFIG} --get-blockout)
38+
VTA_BUS_WIDTH := $(shell ${VTA_CONFIG} --get-buswidth)
3839
VTA_UOP_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-uopbuffsize)
3940
VTA_INP_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-inpbuffsize)
4041
VTA_WGT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-wgtbuffsize)
@@ -90,7 +91,7 @@ $(IP_PATH): $(SRC_DIR)/*
9091
$(MODE) $(DEBUG) $(VTA_ALU_EN) $(VTA_MUL_EN) \
9192
$(VTA_TARGET_PER) $(VTA_GEMM_II) $(VTA_TALU_II) \
9293
$(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_ACC_WIDTH) $(VTA_OUT_WIDTH) \
93-
$(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \
94+
$(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) $(VTA_BUS_WIDTH) \
9495
$(VTA_UOP_BUFF_SIZE) $(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) \
9596
$(VTA_ACC_BUFF_SIZE) $(VTA_OUT_BUFF_SIZE)
9697
ifeq ($(SLURM), True)
@@ -101,7 +102,7 @@ endif
101102
$(BIT_PATH): $(IP_PATH)
102103
mkdir -p $(HW_BUILD_PATH)
103104
cd $(HW_BUILD_PATH) && \
104-
$(VIVADO) -mode tcl -source $(SCRIPT_DIR)/ultra96.tcl \
105+
$(VIVADO) -mode tcl -source $(SCRIPT_DIR)/vivado.tcl \
105106
-tclargs $(VTA_TARGET) $(BUILD_DIR)/hls/$(CONF) $(VTA_HW_COMP_THREADS) \
106107
$(VTA_CLOCK_FREQ) $(VTA_GEMM_II) \
107108
$(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_OUT_WIDTH) \

vta/hardware/xilinx/scripts/hls.tcl

Lines changed: 34 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,14 @@
2424
# Arg 17: batch size (log)
2525
# Arg 18: in block size (log)
2626
# Arg 19: out block size (log)
27-
# Arg 20: uop buffer size in B (log)
28-
# Arg 21: inp buffer size in B (log)
29-
# Arg 22: wgt buffer size in B (log)
30-
# Arg 23: acc buffer size in B (log)
31-
# Arg 24: out buffer size in B (log)
32-
33-
if { [llength $argv] eq 26 } {
27+
# Arg 20: bus width in b (log)
28+
# Arg 21: uop buffer size in B (log)
29+
# Arg 22: inp buffer size in B (log)
30+
# Arg 23: wgt buffer size in B (log)
31+
# Arg 24: acc buffer size in B (log)
32+
# Arg 25: out buffer size in B (log)
33+
34+
if { [llength $argv] eq 27 } {
3435
set target [lindex $argv 2]
3536
set src_dir [lindex $argv 3]
3637
set sim_dir [lindex $argv 4]
@@ -50,43 +51,24 @@ if { [llength $argv] eq 26 } {
5051
set batch [lindex $argv 18]
5152
set block_in [lindex $argv 19]
5253
set block_out [lindex $argv 20]
53-
set uop_buff_size [lindex $argv 21]
54-
set inp_buff_size [lindex $argv 22]
55-
set wgt_buff_size [lindex $argv 23]
56-
set acc_buff_size [lindex $argv 24]
57-
set out_buff_size [lindex $argv 25]
54+
set bus_width [lindex $argv 21]
55+
set uop_buff_size [lindex $argv 22]
56+
set inp_buff_size [lindex $argv 23]
57+
set wgt_buff_size [lindex $argv 24]
58+
set acc_buff_size [lindex $argv 25]
59+
set out_buff_size [lindex $argv 26]
5860
} else {
59-
set target "pynq"
60-
set src_dir "../src"
61-
set sim_dir "../sim"
62-
set test_dir "../../src/test"
63-
set include_dir "../../include"
64-
set mode "all"
65-
set debug "False"
66-
set alu_ena "True"
67-
set mul_ena "True"
68-
set target_period 8
69-
set target_gemm_ii 10
70-
set target_alu_ii 16
71-
set inp_width 3
72-
set wgt_width 3
73-
set acc_width 5
74-
set out_width 3
75-
set batch 1
76-
set block_in 4
77-
set block_out 4
78-
set uop_buff_size 15
79-
set inp_buff_size 15
80-
set wgt_buff_size 15
81-
set acc_buff_size 17
82-
set out_buff_size 15
61+
puts "Not enough arguments provided!"
8362
exit
8463
}
8564

65+
puts "about to start doing some stuff"
66+
67+
8668
# Initializes the HLS design and sets HLS pragmas for memory partitioning.
8769
# This is necessary because of a Vivado restriction that doesn't allow for
8870
# buses wider than 1024 bits.
89-
proc init_design {target per g_ii a_ii inp_width wgt_width out_width acc_width batch block_in block_out alu_ena} {
71+
proc init_design {target per g_ii a_ii bus_width inp_width wgt_width out_width acc_width batch block_in block_out alu_ena} {
9072

9173
# Set device number
9274
if {$target=="pynq"} {
@@ -95,28 +77,25 @@ proc init_design {target per g_ii a_ii inp_width wgt_width out_width acc_width b
9577
set_part {xczu3eg-sbva484-1-e}
9678
} elseif {$target=="zcu102"} {
9779
set_part {xczu9eg-ffvb1156-2-e}
80+
} elseif {$target=="f1"} {
81+
set_part {xcvu9p-flgb2104-2-i}
82+
# config_interface -m_axi_addr64
9883
}
9984

10085
# Max bus width (supported by Vivado)
10186
set max_width 1024
10287

103-
# Set axi width (TODO derive from top level config)
104-
if {$target=="pynq"} {
105-
set axi_width 64
106-
} elseif {$target=="ultra96"} {
107-
set axi_width 128
108-
} elseif {$target=="zcu102"} {
109-
set axi_width 128
110-
}
88+
# Set axi width
89+
set axi_width [expr {1 << $bus_width}]
11190

11291
# Set the clock frequency
11392
create_clock -period $per -name default
11493

11594
# Set pipeline directive
116-
set_directive_pipeline -II $g_ii "compute/READ_GEMM_UOP"
95+
set_directive_pipeline -II $g_ii "gemm/READ_GEMM_UOP"
11796

11897
if {$alu_ena=="True"} {
119-
set_directive_pipeline -II $a_ii "compute/READ_ALU_UOP"
98+
set_directive_pipeline -II $a_ii "alu/READ_ALU_UOP"
12099
}
121100

122101
# Set input partition factor to (INP_VECTOR_WIDTH*BATCH/(1024*g_ii))
@@ -174,7 +153,8 @@ set cflags "-I $include_dir -I $src_dir -I $test_dir \
174153
-DVTA_LOG_BATCH=$batch -DVTA_LOG_BLOCK_OUT=$block_out -DVTA_LOG_BLOCK_IN=$block_in \
175154
-DVTA_LOG_UOP_BUFF_SIZE=$uop_buff_size -DVTA_LOG_INP_BUFF_SIZE=$inp_buff_size \
176155
-DVTA_LOG_WGT_BUFF_SIZE=$wgt_buff_size -DVTA_LOG_ACC_BUFF_SIZE=$acc_buff_size \
177-
-DVTA_LOG_OUT_BUFF_SIZE=$out_buff_size"
156+
-DVTA_LOG_OUT_BUFF_SIZE=$out_buff_size -DVTA_LOG_BUS_WIDTH=$bus_width \
157+
-DVTA_GEMM_II=$target_gemm_ii"
178158
if {$debug=="True"} {
179159
append cflags " -DVTA_DEBUG=1"
180160
}
@@ -185,6 +165,7 @@ if {$mul_ena=="True"} {
185165
append cflags " -DMUL_EN"
186166
}
187167

168+
188169
# HLS behavioral sim
189170
if {$mode=="all" || $mode=="sim"} {
190171
open_project vta_sim
@@ -193,7 +174,7 @@ if {$mode=="all" || $mode=="sim"} {
193174
add_files -tb $sim_dir/vta_test.cc -cflags $cflags
194175
add_files -tb $test_dir/test_lib.cc -cflags $cflags
195176
open_solution "solution0"
196-
init_design $target $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
177+
init_design $target $target_period $target_gemm_ii $target_alu_ii $bus_width $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
197178
csim_design -clean
198179
close_project
199180
}
@@ -204,7 +185,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="fetch"} {
204185
set_top fetch
205186
add_files $src_dir/vta.cc -cflags $cflags
206187
open_solution "solution0"
207-
init_design $target $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
188+
init_design $target $target_period $target_gemm_ii $target_alu_ii $bus_width $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
208189
csynth_design
209190
if {$mode=="all" || $mode=="skip_sim"} {
210191
export_design -format ip_catalog
@@ -218,7 +199,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="load"} {
218199
set_top load
219200
add_files $src_dir/vta.cc -cflags $cflags
220201
open_solution "solution0"
221-
init_design $target $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
202+
init_design $target $target_period $target_gemm_ii $target_alu_ii $bus_width $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
222203
csynth_design
223204
if {$mode=="all" || $mode=="skip_sim"} {
224205
export_design -format ip_catalog
@@ -232,7 +213,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="compute"} {
232213
set_top compute
233214
add_files $src_dir/vta.cc -cflags $cflags
234215
open_solution "solution0"
235-
init_design $target $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
216+
init_design $target $target_period $target_gemm_ii $target_alu_ii $bus_width $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
236217
csynth_design
237218
if {$mode=="all" || $mode=="skip_sim"} {
238219
export_design -format ip_catalog
@@ -246,7 +227,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="store"} {
246227
set_top store
247228
add_files $src_dir/vta.cc -cflags $cflags
248229
open_solution "solution0"
249-
init_design $target $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
230+
init_design $target $target_period $target_gemm_ii $target_alu_ii $bus_width $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
250231
csynth_design
251232
if {$mode=="all" || $mode=="skip_sim"} {
252233
export_design -format ip_catalog

0 commit comments

Comments
 (0)