FPGA4HEP · jmduarte · Mar 18, 2020
diff --git a/host b/host
diff --git a/src/alveo_hls4ml.cpp b/src/alveo_hls4ml.cpp
@@ -47,8 +47,8 @@ EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    */
 extern "C" {
 void alveo_hls4ml(
-        const data_t *in, // Read-Only Vector
-        data_t *out       // Output Result
+        const bigdata_t *in, // Read-only input: one packed bigdata_t word per event (DATA_SIZE_IN 32-bit fields each)
+        bigdata_t *out       // Output: packed bigdata_t words, COMPRESSION 32-bit results per word
         )
 {
 // SDAccel kernel must have one and only one s_axilite interface which will be used by host application to configure the kernel.
@@ -59,45 +59,59 @@ void alveo_hls4ml(
 // accessing global memory through this interface.
 // Multiple interfaces can also be created based on the requirements. For example when multiple memory accessing arguments need access to
 // global memory simultaneously, user can create multiple master interfaces and can connect to different arguments.
-#pragma HLS INTERFACE m_axi port=in  offset=slave bundle=gmem
-#pragma HLS INTERFACE m_axi port=out offset=slave bundle=gmem
+#pragma HLS INTERFACE m_axi port=in  offset=slave bundle=gmem0
+#pragma HLS INTERFACE m_axi port=out offset=slave bundle=gmem1
 #pragma HLS INTERFACE s_axilite port=in   bundle=control
 #pragma HLS INTERFACE s_axilite port=out  bundle=control
 #pragma HLS INTERFACE s_axilite port=return bundle=control
-  //#pragma HLS dataflow
-    unsigned short insize, outsize;
+    #pragma HLS DATAFLOW
     //necessary for hls4ml kernel, not used
+
+    bigdata_t in_bigbuf[STREAMSIZE]; //local copy of the packed input words, one per event
+    bigdata_t out_bigbuf[STREAMSIZE]; //packed output words; only the first COMPSTREAMSIZE entries are written below
 
     input_t in_buf[STREAMSIZE][DATA_SIZE_IN];
     layer6_t out_buf[STREAMSIZE][DATA_SIZE_OUT];
     //these will get partitioned properly in the hls4ml code
 
-    #pragma HLS ARRAY_PARTITION variable=in_buf complete dim=2
-    #pragma HLS ARRAY_PARTITION variable=out_buf complete dim=2 
-    //#pragma HLS ARRAY_RESHAPE   variable=in_buf block factor=11
-    //#pragma HLS ARRAY_RESHAPE   variable=out_buf block factor=11
-
-    //getting data from axi stream and formatting properly
+    //#pragma HLS ARRAY_RESHAPE   variable=in_bigbuf  block factor=64
+    //#pragma HLS ARRAY_RESHAPE   variable=out_bigbuf block factor=64
+    #pragma HLS ARRAY_RESHAPE   variable=in_buf  complete dim=2
+    #pragma HLS ARRAY_RESHAPE   variable=out_buf complete dim=2
+
+    //copy the packed input words from global memory (port in, bundle gmem0) into the local buffer
+    for (int i = 0; i < STREAMSIZE; i++) {
+      in_bigbuf[i] = in[i];
+    }
     for (int i = 0; i < STREAMSIZE; i++) {
-#pragma HLS PIPELINE II=11 rewind
-      for (int j = 0; j < DATA_SIZE_IN; j++) {
-	in_buf[i][j] = (input_t)in[i*DATA_SIZE_IN+j];
+      #pragma HLS PIPELINE
+      for(int i0 = 0; i0 < DATA_SIZE_IN; i0++) { 
+         #pragma HLS UNROLL
+	 in_buf[i][i0].range(31,0) = in_bigbuf[i].range(32*(i0+1)-1,32*i0); //bit-for-bit copy of 32-bit field i0 of word i
       }
     }
-
     //run inference
+    //unsigned int count = 0; 
+    //unsigned int outcount = 0; 
     for (int i = 0; i < STREAMSIZE; i++) {
-#pragma HLS PIPELINE II=1 rewind
-    hls4ml: MYPROJ(in_buf[i],out_buf[i],insize,outsize);
+      #pragma HLS DATAFLOW
+      hls4ml: MYPROJ(in_buf[i],out_buf[i]);
     }
-
-    //place output into axi stream output
-    for (int i = 0; i < STREAMSIZE; i++) {
-#pragma HLS PIPELINE II=1 rewind
-      for (int j = 0; j < DATA_SIZE_OUT; j++) {
-	out[i*DATA_SIZE_OUT+j] = (data_t)out_buf[i][j];
+    for (int i = 0; i < COMPSTREAMSIZE; i++) { //pack the results of COMPRESSION consecutive events into output word i
+      #pragma HLS PIPELINE
+      bigdata_t tmp; //output word being assembled
+      for(int i1 = 0; i1 < COMPRESSION;i1++) { 
+	for(int i0 = 0; i0 < DATA_SIZE_OUT; i0++) { 
+         #pragma HLS UNROLL
+	 tmp((i1+1)*32-1,(i1)*32) = out_buf[i*COMPRESSION+i1][i0].range(31,0); //NOTE(review): slot depends only on i1, so with DATA_SIZE_OUT > 1 only the last i0 survives; reads out_buf rows up to COMPSTREAMSIZE*COMPRESSION-1 -- confirm STREAMSIZE covers that
+        }
       }
+      out_bigbuf[i] = tmp;
     }
-}
+    //write the packed output words back to global memory (port out, bundle gmem1)
+    for (int i = 0; i < COMPSTREAMSIZE; i++) {
+     out[i] = out_bigbuf[i];
+    }
+  }
 }
 
diff --git a/src/ereg_v1.cpp b/src/ereg_v1.cpp
@@ -30,9 +30,9 @@
 
 void ereg_v1(
     input_t input[N_INPUT_1_1],
-    layer6_t layer6_out[N_LAYER_6],
-    unsigned short &const_size_in_1,
-    unsigned short &const_size_out_1
+    layer6_t layer6_out[N_LAYER_6]
+    //const_size_in_1 / const_size_out_1 (unsigned short&) are no longer parameters;
+    //the sizes stay available as the compile-time constants N_INPUT_1_1 / N_LAYER_6
 ) {
 
     //hls-fpga-machine-learning insert IO
@@ -41,8 +41,8 @@ void ereg_v1(
     #pragma HLS INTERFACE ap_vld port=input,layer6_out 
     #pragma HLS PIPELINE 
 
-    const_size_in_1 = N_INPUT_1_1;
-    const_size_out_1 = N_LAYER_6;
+  //const_size_in_1 = N_INPUT_1_1;   (dropped together with the reference parameters)
+  //const_size_out_1 = N_LAYER_6;    (dropped together with the reference parameters)
 
 #ifndef __SYNTHESIS__
     static bool loaded_weights = false;

diff --git a/src/ereg_v1.h b/src/ereg_v1.h
@@ -30,9 +30,9 @@
 // Prototype of top level function for C-synthesis
 void ereg_v1(
     input_t input[N_INPUT_1_1],
-    layer6_t layer6_out[N_LAYER_6],
-    unsigned short &const_size_in_1,
-    unsigned short &const_size_out_1
+    layer6_t layer6_out[N_LAYER_6]
+    //const_size_in_1 / const_size_out_1 (unsigned short&) are no longer part of the prototype;
+    //the definition in ereg_v1.cpp drops them as well
 );
 
 #endif
diff --git a/src/host.cpp b/src/host.cpp
@@ -50,22 +50,22 @@ int main(int argc, char** argv)
     if (argc > 2) datadir = argv[2];
     std::cout << "Will run " << nevents << " time(s), using " << datadir << " to get input features and output predictions (tb_input_features.dat and tb_output_predictions.dat)" << std::endl;
 
-    size_t vector_size_in_bytes = sizeof(data_t) * DATA_SIZE_IN * STREAMSIZE;
-    size_t vector_size_out_bytes = sizeof(data_t) * DATA_SIZE_OUT * STREAMSIZE;
+    size_t vector_size_in_bytes = sizeof(bigdata_t) * STREAMSIZE; // one packed bigdata_t word per input event
+    size_t vector_size_out_bytes = sizeof(bigdata_t) * COMPSTREAMSIZE; // COMPRESSION results share each output word
     // Allocate Memory in Host Memory
     // When creating a buffer with user pointer (CL_MEM_USE_HOST_PTR), under the hood user ptr 
     // is used if it is properly aligned. when not aligned, runtime had no choice but to create
     // its own host side buffer. So it is recommended to use this allocator if user wish to
     // create buffer using CL_MEM_USE_HOST_PTR to align user buffer to page boundary. It will 
     // ensure that user buffer is used when user create Buffer/Mem object with CL_MEM_USE_HOST_PTR 
-    std::vector<data_t,aligned_allocator<data_t>> source_in(DATA_SIZE_IN*STREAMSIZE);
-    std::vector<data_t,aligned_allocator<data_t>> source_hw_results(DATA_SIZE_OUT*STREAMSIZE);
+    std::vector<bigdata_t,aligned_allocator<bigdata_t>> source_in(STREAMSIZE); // element count matches vector_size_in_bytes
+    std::vector<bigdata_t,aligned_allocator<bigdata_t>> source_hw_results(COMPSTREAMSIZE); // element count matches vector_size_out_bytes
 
     //initialize
-    for(int j = 0 ; j < DATA_SIZE_IN*STREAMSIZE ; j++){
+    for(int j = 0 ; j < STREAMSIZE ; j++){ // zero every packed input word
         source_in[j] = 0;
     }
-    for(int j = 0 ; j < DATA_SIZE_OUT*STREAMSIZE ; j++){
+    for(int j = 0 ; j < COMPSTREAMSIZE ; j++){ // zero every packed output word
         source_hw_results[j] = 0;
     }
 
@@ -152,24 +152,24 @@ int main(int argc, char** argv)
                         current=strtok(NULL," ");
                     }
                     for (int j = 0; j < DATA_SIZE_IN; j++) {
-                        source_in[istream*DATA_SIZE_IN+j] = (data_t)in[j];
-                    }
-                    for(int j = 0 ; j < DATA_SIZE_OUT ; j++){
-                        source_hw_results[istream*DATA_SIZE_OUT+j] = 0;
+		      source_in[istream].range(32*(j+1)-1,32*j) =  ((data_t)in[j]).range(31,0); //raw bits of feature j go to bits [32j, 32j+31] of this event's word
                     }
+		    //data_t test;
+		    //test.range(31,0) = source_in[istream].range(32*11-1,32*10);
+		    //std::cout << "input check ===> " << source_in[istream] << " -- " <<  source_in[istream].range(31,0) << " -- " << test << " -- " << ((data_t)in[10]) << std::endl;
+		    if(istream % COMPRESSION == 0) source_hw_results[istream/COMPRESSION] = 0; //clear each shared output word once, on the first event that maps to it
+
                 } else {
                     hit_end = true;
                 }
             }
             else {
             // Create the test data if no data files found or if end of files has been reached
                 for(int j = 0 ; j < DATA_SIZE_IN; j++){
-                    source_in[istream*DATA_SIZE_IN+j] = (data_t)(12.34*(j+DATA_SIZE_IN*STREAMSIZE*(i+1)));
-                    //this is just a random number to produce dummy input data
-                }
-                for(int j = 0 ; j < DATA_SIZE_OUT*STREAMSIZE ; j++){
-                    source_hw_results[j] = 0;
+  		  source_in[istream].range(32*(j+1)-1,32*j) = ((data_t)(12.34*(j+DATA_SIZE_IN*STREAMSIZE*(i+1)))).range(31,0);
+                  //arbitrary dummy input; stored as the raw 32 data_t bits in slot j (full 32-bit slice), same layout as the file-driven branch
                 }
+		if(istream % COMPRESSION == 0) source_hw_results[istream/COMPRESSION] = 0; //clear each shared output word once, on the first event that maps to it
             }
         }
 
@@ -197,12 +197,14 @@ int main(int argc, char** argv)
             std::cout << std::endl;
         }
         std::cout<<"Quantized predictions: \n";
-        for (int j = 0 ; j < STREAMSIZE ; j++){
-            for (int k = 0 ; k < DATA_SIZE_OUT ; k++){
-    	        std::cout << source_hw_results[j*DATA_SIZE_OUT + k] << " \t";
-                fout << source_hw_results[j*DATA_SIZE_OUT + k] << " "; 
+        for (int j = 0 ; j < COMPSTREAMSIZE ; j++){
+            for (int k = 0 ; k < COMPRESSION ; k++){
+	      data_t tmp; //prediction k of packed output word j
+	      tmp.range(31,0) = source_hw_results[j].range((k+1)*32-1,k*32); //reinterpret the raw 32-bit slot as data_t
+	      std::cout << tmp  << " \n";
+              fout << tmp  << " \n"; //same separator as stdout; the old " \n " made every following line start with a blank
             }
-            fout << "\n";
+            //no extra newline here: every prediction above already ends its own line
         }
         std::cout << std::endl;
         std::cout<<"---- END EVENT "<<i+1<<" ----"<<std::endl;

diff --git a/src/kernel_params.h b/src/kernel_params.h
@@ -6,5 +6,8 @@
 
 #define DATA_SIZE_IN N_INPUT_1_1
 #define DATA_SIZE_OUT N_LAYER_6
+#define COMPRESSION 16 // 32-bit results packed into one 512-bit bigdata_t word (512/32)
+#define COMPSTREAMSIZE (STREAMSIZE/COMPRESSION) // packed output words per kernel call; derived so it cannot drift from STREAMSIZE (which must be a multiple of COMPRESSION)
 
 typedef ap_fixed<32,14> data_t;
+typedef ap_uint<512>    bigdata_t; // 512-bit word exchanged with global memory
diff --git a/xclbin/alveo_hls4ml.hw.xilinx_u250_xdma_201830_2.xclbin b/xclbin/alveo_hls4ml.hw.xilinx_u250_xdma_201830_2.xclbin