Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified host
Binary file not shown.
64 changes: 39 additions & 25 deletions src/alveo_hls4ml.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
extern "C" {
// Kernel top function. As shown here it copies STREAMSIZE packed words from `in` into a
// local buffer, unpacks each word into DATA_SIZE_IN 32-bit inputs, calls MYPROJ once per
// stream entry, packs COMPRESSION results into each output word and writes
// COMPSTREAMSIZE words to `out`.
// NOTE(review): this listing interleaves the earlier data_t-based lines with the newer
// bigdata_t-based ones (both parameter pairs and both loop variants appear below), so it
// is not compilable as-is — confirm which lines are current before editing.
void alveo_hls4ml(
const data_t *in, // Read-Only Vector
data_t *out // Output Result
const bigdata_t *in, // Read-Only Vector
bigdata_t *out // Output Result
)
{
// SDAccel kernel must have one and only one s_axilite interface which will be used by host application to configure the kernel.
Expand All @@ -59,45 +59,59 @@ void alveo_hls4ml(
// accessing global memory through this interface.
// Multiple interfaces can also be created based on the requirements. For example when multiple memory accessing arguments need access to
// global memory simultaneously, user can create multiple master interfaces and can connect to different arguments.
#pragma HLS INTERFACE m_axi port=in offset=slave bundle=gmem
#pragma HLS INTERFACE m_axi port=out offset=slave bundle=gmem
// The gmem0/gmem1 variants below put `in` and `out` on separate m_axi bundles.
#pragma HLS INTERFACE m_axi port=in offset=slave bundle=gmem0
#pragma HLS INTERFACE m_axi port=out offset=slave bundle=gmem1
#pragma HLS INTERFACE s_axilite port=in bundle=control
#pragma HLS INTERFACE s_axilite port=out bundle=control
#pragma HLS INTERFACE s_axilite port=return bundle=control
//#pragma HLS dataflow
unsigned short insize, outsize;
#pragma HLS DATAFLOW
//necessary for hls4ml kernel, not used

// Local copies of the packed input and output words.
bigdata_t in_bigbuf[STREAMSIZE];
bigdata_t out_bigbuf[STREAMSIZE];

// Per-entry unpacked inputs and network outputs passed to MYPROJ.
input_t in_buf[STREAMSIZE][DATA_SIZE_IN];
layer6_t out_buf[STREAMSIZE][DATA_SIZE_OUT];
//these will get partitioned properly in the hls4ml code

#pragma HLS ARRAY_PARTITION variable=in_buf complete dim=2
#pragma HLS ARRAY_PARTITION variable=out_buf complete dim=2
//#pragma HLS ARRAY_RESHAPE variable=in_buf block factor=11
//#pragma HLS ARRAY_RESHAPE variable=out_buf block factor=11

//getting data from axi stream and formatting properly
//#pragma HLS ARRAY_RESHAPE variable=in_bigbuf block factor=64
//#pragma HLS ARRAY_RESHAPE variable=out_bigbuf block factor=64
#pragma HLS ARRAY_RESHAPE variable=in_buf complete dim=2
#pragma HLS ARRAY_RESHAPE variable=out_buf complete dim=2

//getting data from DDR
for (int i = 0; i < STREAMSIZE; i++) {
in_bigbuf[i] = in[i];
}
for (int i = 0; i < STREAMSIZE; i++) {
#pragma HLS PIPELINE II=11 rewind
for (int j = 0; j < DATA_SIZE_IN; j++) {
in_buf[i][j] = (input_t)in[i*DATA_SIZE_IN+j];
#pragma HLS PIPELINE
for(int i0 = 0; i0 < DATA_SIZE_IN; i0++) {
#pragma HLS UNROLL
// Raw bit copy: bits [32*i0, 32*i0+31] of the packed word become the 32 bits of input i0.
in_buf[i][i0].range(31,0) = in_bigbuf[i].range(32*(i0+1)-1,32*i0);
}
}

//run inference
//unsigned int count = 0;
//unsigned int outcount = 0;
for (int i = 0; i < STREAMSIZE; i++) {
#pragma HLS PIPELINE II=1 rewind
hls4ml: MYPROJ(in_buf[i],out_buf[i],insize,outsize);
#pragma HLS DATAFLOW
hls4ml: MYPROJ(in_buf[i],out_buf[i]);
}

//place output into axi stream output
for (int i = 0; i < STREAMSIZE; i++) {
#pragma HLS PIPELINE II=1 rewind
for (int j = 0; j < DATA_SIZE_OUT; j++) {
out[i*DATA_SIZE_OUT+j] = (data_t)out_buf[i][j];
for (int i = 0; i < COMPSTREAMSIZE; i++) {
#pragma HLS PIPELINE
// One packed output word; slot i1 (bits [32*i1, 32*i1+31]) holds the result of
// stream entry i*COMPRESSION+i1.
bigdata_t tmp;
for(int i1 = 0; i1 < COMPRESSION;i1++) {
for(int i0 = 0; i0 < DATA_SIZE_OUT; i0++) {
#pragma HLS UNROLL
// NOTE(review): the destination bit range depends only on i1, so every i0 iteration
// writes the same slot and only the last output element survives — this looks like
// it assumes DATA_SIZE_OUT == 1; confirm.
tmp((i1+1)*32-1,(i1)*32) = out_buf[i*COMPRESSION+i1][i0].range(31,0);
}
}
out_bigbuf[i] = tmp;
}
}
//place output into DDR
for (int i = 0; i < COMPSTREAMSIZE; i++) {
out[i] = out_bigbuf[i];
}
}
}

10 changes: 5 additions & 5 deletions src/ereg_v1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@

void ereg_v1(
input_t input[N_INPUT_1_1],
layer6_t layer6_out[N_LAYER_6],
unsigned short &const_size_in_1,
unsigned short &const_size_out_1
layer6_t layer6_out[N_LAYER_6]
//unsigned short &const_size_in_1,
//unsigned short &const_size_out_1
) {

//hls-fpga-machine-learning insert IO
Expand All @@ -41,8 +41,8 @@ void ereg_v1(
#pragma HLS INTERFACE ap_vld port=input,layer6_out
#pragma HLS PIPELINE

const_size_in_1 = N_INPUT_1_1;
const_size_out_1 = N_LAYER_6;
//const_size_in_1 = N_INPUT_1_1;
//const_size_out_1 = N_LAYER_6;

#ifndef __SYNTHESIS__
static bool loaded_weights = false;
Expand Down
6 changes: 3 additions & 3 deletions src/ereg_v1.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@
// Prototype of top level function for C-synthesis
// Takes an N_INPUT_1_1-element input array and an N_LAYER_6-element output array.
// NOTE(review): the two `unsigned short &` size parameters appear here both as live
// parameters and as commented-out lines — confirm which signature is current.
void ereg_v1(
input_t input[N_INPUT_1_1],
layer6_t layer6_out[N_LAYER_6],
unsigned short &const_size_in_1,
unsigned short &const_size_out_1
layer6_t layer6_out[N_LAYER_6]
//unsigned short &const_size_in_1,
//unsigned short &const_size_out_1
);

#endif
42 changes: 22 additions & 20 deletions src/host.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,22 +50,22 @@ int main(int argc, char** argv)
if (argc > 2) datadir = argv[2];
std::cout << "Will run " << nevents << " time(s), using " << datadir << " to get input features and output predictions (tb_input_features.dat and tb_output_predictions.dat)" << std::endl;

size_t vector_size_in_bytes = sizeof(data_t) * DATA_SIZE_IN * STREAMSIZE;
size_t vector_size_out_bytes = sizeof(data_t) * DATA_SIZE_OUT * STREAMSIZE;
size_t vector_size_in_bytes = sizeof(bigdata_t) * STREAMSIZE;
size_t vector_size_out_bytes = sizeof(bigdata_t) * COMPSTREAMSIZE;
// Allocate Memory in Host Memory
// When a buffer is created from a user pointer (CL_MEM_USE_HOST_PTR), the runtime uses that
// pointer directly only if it is properly aligned. When it is not aligned, the runtime has no
// choice but to create its own host-side buffer. It is therefore recommended to use this
// allocator, which aligns the user buffer to a page boundary, whenever a buffer is created
// with CL_MEM_USE_HOST_PTR. This ensures that the user buffer itself is used when the
// Buffer/Mem object is created with CL_MEM_USE_HOST_PTR.
std::vector<data_t,aligned_allocator<data_t>> source_in(DATA_SIZE_IN*STREAMSIZE);
std::vector<data_t,aligned_allocator<data_t>> source_hw_results(DATA_SIZE_OUT*STREAMSIZE);
std::vector<bigdata_t,aligned_allocator<bigdata_t>> source_in(STREAMSIZE);
std::vector<bigdata_t,aligned_allocator<bigdata_t>> source_hw_results(COMPSTREAMSIZE);

//initialize
for(int j = 0 ; j < DATA_SIZE_IN*STREAMSIZE ; j++){
for(int j = 0 ; j < STREAMSIZE ; j++){
source_in[j] = 0;
}
for(int j = 0 ; j < DATA_SIZE_OUT*STREAMSIZE ; j++){
for(int j = 0 ; j < COMPSTREAMSIZE ; j++){
source_hw_results[j] = 0;
}

Expand Down Expand Up @@ -152,24 +152,24 @@ int main(int argc, char** argv)
current=strtok(NULL," ");
}
for (int j = 0; j < DATA_SIZE_IN; j++) {
source_in[istream*DATA_SIZE_IN+j] = (data_t)in[j];
}
for(int j = 0 ; j < DATA_SIZE_OUT ; j++){
source_hw_results[istream*DATA_SIZE_OUT+j] = 0;
source_in[istream].range(32*(j+1)-1,32*j) = ((data_t)in[j]).range(31,0);
}
//data_t test;
//test.range(31,0) = source_in[istream].range(32*11-1,32*10);
//std::cout << "input check ===> " << source_in[istream] << " -- " << source_in[istream].range(31,0) << " -- " << test << " -- " << ((data_t)in[10]) << std::endl;
if(istream % COMPRESSION == 0) source_hw_results[istream/COMPRESSION] = 0;

} else {
hit_end = true;
}
}
else {
// Create the test data if no data files found or if end of files has been reached
for(int j = 0 ; j < DATA_SIZE_IN; j++){
source_in[istream*DATA_SIZE_IN+j] = (data_t)(12.34*(j+DATA_SIZE_IN*STREAMSIZE*(i+1)));
//this is just a random number to produce dummy input data
}
for(int j = 0 ; j < DATA_SIZE_OUT*STREAMSIZE ; j++){
source_hw_results[j] = 0;
// Store the raw 32-bit pattern of the dummy value in slot j (bits 32*j .. 32*j+31) of the
// packed word, the same layout used when inputs are read from file. The upper bound must
// be 32*(j+1)-1; 32*j+1 would select only two bits. Taking .range(31,0) of the data_t
// copies its bit pattern rather than converting the fixed-point value to an integer.
source_in[istream].range(32*(j+1)-1,32*j) = ((data_t)(12.34*(j+DATA_SIZE_IN*STREAMSIZE*(i+1)))).range(31,0);
//this is just a random number to produce dummy input data
}
if(istream % COMPRESSION == 0) source_hw_results[istream/COMPRESSION] = 0;
}
}

Expand Down Expand Up @@ -197,12 +197,14 @@ int main(int argc, char** argv)
std::cout << std::endl;
}
std::cout<<"Quantized predictions: \n";
for (int j = 0 ; j < STREAMSIZE ; j++){
for (int k = 0 ; k < DATA_SIZE_OUT ; k++){
std::cout << source_hw_results[j*DATA_SIZE_OUT + k] << " \t";
fout << source_hw_results[j*DATA_SIZE_OUT + k] << " ";
for (int j = 0 ; j < COMPSTREAMSIZE ; j++){
for (int k = 0 ; k < COMPRESSION ; k++){
data_t tmp;
tmp.range(31,0) = source_hw_results[j].range((k+1)*32-1,k*32);
std::cout << tmp << " \n";
fout << tmp << " \n ";
}
fout << "\n";
//fout << "\n";
}
std::cout << std::endl;
std::cout<<"---- END EVENT "<<i+1<<" ----"<<std::endl;
Expand Down
3 changes: 3 additions & 0 deletions src/kernel_params.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,8 @@

// Per-entry element counts, taken from the ereg_v1 input and layer6_out array sizes.
#define DATA_SIZE_IN N_INPUT_1_1
#define DATA_SIZE_OUT N_LAYER_6
// Number of 32-bit values packed into one bigdata_t word (16 * 32 = 512 bits).
#define COMPRESSION 16
// Number of bigdata_t words in the packed output buffer. The kernel indexes
// out_buf[i*COMPRESSION+i1] for i < COMPSTREAMSIZE, so COMPSTREAMSIZE*COMPRESSION must not
// exceed STREAMSIZE (presumably it equals STREAMSIZE; STREAMSIZE is defined outside this
// view — confirm).
#define COMPSTREAMSIZE 1024

// 32-bit fixed-point value with 14 integer bits.
typedef ap_fixed<32,14> data_t;
// 512-bit unsigned word used to move packed data between host and kernel.
typedef ap_uint<512> bigdata_t;
Binary file modified xclbin/alveo_hls4ml.hw.xilinx_u250_xdma_201830_2.xclbin
Binary file not shown.