Skip to content

Commit 578876f

Browse files
committed
further eval int16 128b
1 parent 04dec14 commit 578876f

File tree

11 files changed

+1763
-48
lines changed

11 files changed

+1763
-48
lines changed

README.md

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -79,24 +79,26 @@ EdgeBoard(ZU3EG): 1.2GHz A53 4 cores + 4GiB DDR4 + FPGA
7979

8080
| ID | DataType | hls_target_clk |Tn/Tm/Tr/Tc/II_CONV/II_POOL/PP_I+W,O | DSP | BRAM | LUT | FF | Freq (MHz) | Dev | ref repo|
8181
| --- | --- | --- | --- |--- | --- | --- | --- | --- |--- | ---|
82-
| A |FT32 | 3.0| 4/28/26/32/3/3/1+1,1 | 259(72%) | 90.5(42%) | 31983(45%) | 57683(41%) | 200 |EdgeBoard(ZU3EG)| 02_FT32|
83-
| B |FT32 | 3.0| 4/28/26/32/3/3/4&4,2 | 334(93%) | 109.0(50%) | 44855(64%) | 78699(56%) | 190 |EdgeBoard(ZU3EG)| 02_FT32_mp_r4w2|
82+
| A |FT32 | 3.0| 4/28/26/32/3/3/1+1,1 | 259(72%) | 90.5(42%) | 31983(45%) | 57683(41%) | 200 |EdgeBoard(ZU3EG)| 02_FT32|
83+
| B |FT32 | 3.0| 4/36/26/32/3/3/4&4,2 | 334(93%) | 109.0(50%) | 44855(64%) | 78699(56%) | 190 |EdgeBoard(ZU3EG)| 02_FT32_mp_r4w2|
84+
| C |INT16 | 3.0| 8/24/26/26/1/1/1&1,1 | 253(70%) | 88.0(41%) | 50447(71%) | 25249(18%) | 190 |EdgeBoard(ZU3EG)| 02_INT16_128b|
8485

8586
*PP_I+W,O denotes the number of parallel data ports in the accelerator interface. In Design A, [1+1,1] means that the ifm and weight buffers each own an independent port (+ means or).
8687
In Design B, [4&4,2] means that the ifm and weight buffers share the same 4 ports, and the ofm buffers own 2 concurrent write-back ports.
8788

8889

89-
|ID | A | B |
90-
|---|---|---|
91-
|CNN models |YOLO v2 |YOLO v2 |
92-
|Board | ZU3EG | ZU3EG |
93-
|Acc-Clock(MHz) | 200 | 190 |
94-
|Precision | FT32 | FT32 |
95-
|Power (cpu idle + static fpga + dynamic cpu & fpga, W) | 6.63 + 0.55 + 1.82| 6.63 + 0.70 + 2.23|
96-
|Operations (GOP) |29.472 |29.472 |
97-
|Latency* (s) | 2.255 |1.801|
98-
|Performance(GOP/s) |13.069 |16.364|
99-
|Power Efficiency(GOP/s/W) | 5.514 |5.585|
90+
|ID | A | B | C|
91+
|---|---|---|---|
92+
|CNN models |YOLO v2 |YOLO v2 |YOLO v2 |
93+
|Board | ZU3EG | ZU3EG | ZU3EG |
94+
|Acc-Clock(MHz) | 200 | 190 | 190 |
95+
|current/available Bit_DataBus (bit) | 32/128 | 32/128 | 128/128 |
96+
|Precision | FT32 | FT32 | INT16 |
97+
|Power (cpu idle + static fpga + dynamic cpu & fpga, W) | 6.63 + 0.55 + 1.82| 6.63 + 0.70 + 2.23| 6.63 + 0.27 + 0.77|
98+
|Operations (GOP) |29.472 |29.472 |29.472 |
99+
|Latency* (s) | 2.255 |1.801| 0.475|
100+
|Performance(GOP/s) |13.069 |16.364|62.020|
101+
|Power Efficiency(GOP/s/W) | 5.514 |5.585|59.634|
100102

101103
*Latency does not include the post-processing stage on the CPU (e.g., the last region layer and the image-saving procedure). Power Efficiency only counts the static + dynamic power of the FPGA & CPU (the CPU idle power is excluded). CPU power could be further reduced by disabling unused modules and buses.
102104

SDK/src_float32/yolov2_acc_sim.h

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -230,12 +230,24 @@ void yolov2_hls_ps(network *net, float *input)
230230
// TC = MIN(((OnChipIB_Width-l.size)/l.stride+1),Tc);
231231
// TC = MIN(output_w,TC);
232232

233-
TC = MIN(((IB_HxW-l.size)/l.stride+1),output_w);
233+
// TC = MIN(((IB_HxW-l.size)/l.stride+1),output_w);
234+
// TCol = (TC-1)*l.stride + l.size;
235+
// TR = MIN(((IB_HxW/TCol-l.size)/l.stride+1),output_h);//keep Kernel_stride>=1
236+
// TR = MIN(TR, TrxTc/TC);
237+
// TRow = (TR-1)*l.stride + l.size;
238+
239+
assert((IB_HxW/l.size)>=l.size);
240+
TC = MIN(((IB_HxW/l.size-l.size)/l.stride+1),output_w);
241+
TC = MIN(TrxTc, TC);
234242
TCol = (TC-1)*l.stride + l.size;
235243
TR = MIN(((IB_HxW/TCol-l.size)/l.stride+1),output_h);//keep Kernel_stride>=1
236244
TR = MIN(TR, TrxTc/TC);
237245
TRow = (TR-1)*l.stride + l.size;
238246

247+
// assert(((TR*TC)>0)&&((TR*TC)<=TrxTc));
248+
// assert(((TRow*TCol)>0)&&((TRow*TCol)<=IB_HxW));
249+
// printf("TR=%d, TC=%d, TRow=%d, TCol=%d\n", TR, TC, TRow, TCol);
250+
239251
TM = MIN(l.n,Tm);
240252
TN = MIN(l.c,Tn);
241253

@@ -275,11 +287,19 @@ void yolov2_hls_ps(network *net, float *input)
275287
// TR = MIN(output_h,TR);
276288
// TC = MIN(output_w,TC);
277289

278-
TC = MIN(((IB_HxW-l.size)/l.stride+1),output_w);
290+
// TC = MIN(((IB_HxW-l.size)/l.stride+1),output_w);
291+
// TCol = (TC-1)*l.stride + l.size;
292+
// TR = MIN(((IB_HxW/TCol-l.size)/l.stride+1),output_h);//keep Kernel_stride>=1
293+
// TR = MIN(TR, TrxTc/TC);
294+
// TRow = (TR-1)*l.stride + l.size;
295+
296+
assert((IB_HxW/l.size)>=l.size);
297+
TC = MIN(((IB_HxW/l.size-l.size)/l.stride+1),output_w);
298+
TC = MIN(TrxTc, TC);
279299
TCol = (TC-1)*l.stride + l.size;
280300
TR = MIN(((IB_HxW/TCol-l.size)/l.stride+1),output_h);//keep Kernel_stride>=1
281301
TR = MIN(TR, TrxTc/TC);
282-
TRow = (TR-1)*l.stride + l.size;
302+
TRow = (TR-1)*l.stride + l.size;
283303

284304
TM = MIN(Tm,Tn);
285305
TM = MIN(l.c,TM);

SDK/src_float32_mp/yolov2_acc_sim.h

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -239,11 +239,19 @@ void yolov2_hls_ps(network *net, float *input)
239239
// TC = MIN(((OnChipIB_Width-l.size)/l.stride+1),Tc);
240240
// TC = MIN(output_w,TC);
241241

242-
TC = MIN(((IB_HxW-l.size)/l.stride+1),output_w);
242+
// TC = MIN(((IB_HxW-l.size)/l.stride+1),output_w);
243+
// TCol = (TC-1)*l.stride + l.size;
244+
// TR = MIN(((IB_HxW/TCol-l.size)/l.stride+1),output_h);//keep Kernel_stride>=1
245+
// TR = MIN(TR, TrxTc/TC);
246+
// TRow = (TR-1)*l.stride + l.size;
247+
248+
assert((IB_HxW/l.size)>=l.size);
249+
TC = MIN(((IB_HxW/l.size-l.size)/l.stride+1),output_w);
250+
TC = MIN(TrxTc, TC);
243251
TCol = (TC-1)*l.stride + l.size;
244252
TR = MIN(((IB_HxW/TCol-l.size)/l.stride+1),output_h);//keep Kernel_stride>=1
245253
TR = MIN(TR, TrxTc/TC);
246-
TRow = (TR-1)*l.stride + l.size;
254+
TRow = (TR-1)*l.stride + l.size;
247255

248256
TM = MIN(l.n,Tm);
249257
TN = MIN(l.c,Tn);
@@ -284,11 +292,19 @@ void yolov2_hls_ps(network *net, float *input)
284292
// TR = MIN(output_h,TR);
285293
// TC = MIN(output_w,TC);
286294

287-
TC = MIN(((IB_HxW-l.size)/l.stride+1),output_w);
295+
// TC = MIN(((IB_HxW-l.size)/l.stride+1),output_w);
296+
// TCol = (TC-1)*l.stride + l.size;
297+
// TR = MIN(((IB_HxW/TCol-l.size)/l.stride+1),output_h);//keep Kernel_stride>=1
298+
// TR = MIN(TR, TrxTc/TC);
299+
// TRow = (TR-1)*l.stride + l.size;
300+
301+
assert((IB_HxW/l.size)>=l.size);
302+
TC = MIN(((IB_HxW/l.size-l.size)/l.stride+1),output_w);
303+
TC = MIN(TrxTc, TC);
288304
TCol = (TC-1)*l.stride + l.size;
289305
TR = MIN(((IB_HxW/TCol-l.size)/l.stride+1),output_h);//keep Kernel_stride>=1
290306
TR = MIN(TR, TrxTc/TC);
291-
TRow = (TR-1)*l.stride + l.size;
307+
TRow = (TR-1)*l.stride + l.size;
292308

293309
TM = MIN(Tm,Tn);
294310
TM = MIN(l.c,TM);

SDK/src_int16_128b/yolov2_acc_i16c_test.h

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -313,15 +313,25 @@ void yolov2_hls_ps(network *net, float *input)
313313
uint16_t ofm_w = l.out_w;
314314
uint16_t ofm_h = l.out_h;
315315

316+
// uint16_t TR,TC,TM,TN;
317+
// TC = MIN_diy(((IB_HxW-kernel_size)/kernel_stride+1),ofm_w);
318+
// uint16_t TCol = (TC-1)*kernel_stride + kernel_size;
319+
// TR = MIN_diy(((IB_HxW/TCol-kernel_size)/kernel_stride+1),ofm_h);//keep Kernel_stride>=1
320+
// TR = MIN_diy(TR, TrxTc/TC);
321+
// uint16_t TRow = (TR-1)*kernel_stride + kernel_size;
322+
// assert(((TR*TC)>0)&&((TR*TC)<=TrxTc));
323+
// assert(((TRow*TCol)>0)&&((TRow*TCol)<=IB_HxW));
324+
// // printf("TR=%d, TC=%d, TRow=%d, TCol=%d\n", TR, TC, TRow, TCol);
325+
316326
uint16_t TR,TC,TM,TN;
317-
TC = MIN_diy(((IB_HxW-kernel_size)/kernel_stride+1),ofm_w);
318-
uint16_t TCol = (TC-1)*kernel_stride + kernel_size;
319-
TR = MIN_diy(((IB_HxW/TCol-kernel_size)/kernel_stride+1),ofm_h);//keep Kernel_stride>=1
327+
uint16_t TRow, TCol;
328+
assert((IB_HxW/l.size)>=l.size);
329+
TC = MIN_diy(((IB_HxW/l.size-l.size)/l.stride+1),ofm_w);
330+
TC = MIN_diy(TrxTc, TC);
331+
TCol = (TC-1)*l.stride + l.size;
332+
TR = MIN_diy(((IB_HxW/TCol-l.size)/l.stride+1),ofm_h);//keep Kernel_stride>=1
320333
TR = MIN_diy(TR, TrxTc/TC);
321-
uint16_t TRow = (TR-1)*kernel_stride + kernel_size;
322-
assert(((TR*TC)>0)&&((TR*TC)<=TrxTc));
323-
assert(((TRow*TCol)>0)&&((TRow*TCol)<=IB_HxW));
324-
// printf("TR=%d, TC=%d, TRow=%d, TCol=%d\n", TR, TC, TRow, TCol);
334+
TRow = (TR-1)*l.stride + l.size;
325335

326336
TM = MIN_diy(ofm_num,Tm);
327337
TN = MIN_diy(ifm_num,Tn);

hls/src_int16_128b/yolov2_acc_i16c_test.h

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -303,15 +303,25 @@ void yolov2_hls_ps(network *net, float *input)
303303
uint16_t ofm_w = l.out_w;
304304
uint16_t ofm_h = l.out_h;
305305

306+
// uint16_t TR,TC,TM,TN;
307+
// TC = MIN_diy(((IB_HxW-kernel_size)/kernel_stride+1),ofm_w);
308+
// uint16_t TCol = (TC-1)*kernel_stride + kernel_size;
309+
// TR = MIN_diy(((IB_HxW/TCol-kernel_size)/kernel_stride+1),ofm_h);//keep Kernel_stride>=1
310+
// TR = MIN_diy(TR, TrxTc/TC);
311+
// uint16_t TRow = (TR-1)*kernel_stride + kernel_size;
312+
// assert(((TR*TC)>0)&&((TR*TC)<=TrxTc));
313+
// assert(((TRow*TCol)>0)&&((TRow*TCol)<=IB_HxW));
314+
// // printf("TR=%d, TC=%d, TRow=%d, TCol=%d\n", TR, TC, TRow, TCol);
315+
306316
uint16_t TR,TC,TM,TN;
307-
TC = MIN_diy(((IB_HxW-kernel_size)/kernel_stride+1),ofm_w);
308-
uint16_t TCol = (TC-1)*kernel_stride + kernel_size;
309-
TR = MIN_diy(((IB_HxW/TCol-kernel_size)/kernel_stride+1),ofm_h);//keep Kernel_stride>=1
317+
uint16_t TRow, TCol;
318+
assert((IB_HxW/l.size)>=l.size);
319+
TC = MIN_diy(((IB_HxW/l.size-l.size)/l.stride+1),ofm_w);
320+
TC = MIN_diy(TrxTc, TC);
321+
TCol = (TC-1)*l.stride + l.size;
322+
TR = MIN_diy(((IB_HxW/TCol-l.size)/l.stride+1),ofm_h);//keep Kernel_stride>=1
310323
TR = MIN_diy(TR, TrxTc/TC);
311-
uint16_t TRow = (TR-1)*kernel_stride + kernel_size;
312-
assert(((TR*TC)>0)&&((TR*TC)<=TrxTc));
313-
assert(((TRow*TCol)>0)&&((TRow*TCol)<=IB_HxW));
314-
// printf("TR=%d, TC=%d, TRow=%d, TCol=%d\n", TR, TC, TRow, TCol);
324+
TRow = (TR-1)*l.stride + l.size;
315325

316326
TM = MIN_diy(ofm_num,Tm);
317327
TN = MIN_diy(ifm_num,Tn);

software_version/02_ReorganizeWeight_Int16_128b/Makefile

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,3 +88,17 @@ clean:
8888
# tvmonitor: 84%
8989
# chair: 67%
9090
# YOLOv2 TEST End
91+
92+
# i16_test
93+
# [248]:h=0.223551,w=0.223980,x=0.125752,y=0.501132,objectness=0.895157
94+
# [256]:h=0.327496,w=0.076354,x=0.710994,y=0.525914,objectness=0.560493
95+
# [266]:h=0.235887,w=0.109372,x=0.508853,y=0.636246,objectness=0.795975
96+
# [267]:h=0.200978,w=0.120357,x=0.578593,y=0.619532,objectness=0.617859
97+
# [268]:h=0.221812,w=0.085847,x=0.647684,y=0.605883,objectness=0.610918
98+
# [272]:h=0.157442,w=0.126256,x=0.942223,y=0.587404,objectness=0.540196
99+
# [297]:h=0.220085,w=0.096521,x=0.877723,y=0.840422,objectness=0.546738
100+
# vase: 51%
101+
# tvmonitor: 89%
102+
# chair: 78%
103+
# chair: 58%
104+
# YOLOv2 TEST End

0 commit comments

Comments
 (0)