38
38
#pragma GCC visibility push(protected)
39
39
#endif
40
40
41
- static void l_dump_printf_buffer (cl_event event, cl_kernel kernel,
42
- unsigned size);
41
+ static size_t l_dump_printf_buffer (cl_event event, cl_kernel kernel,
42
+ unsigned size);
43
43
static void decode_string (std::string &print_data);
44
44
45
45
// stores to global memory are bound by smallest global memory bandwidth we
@@ -719,12 +719,13 @@ static std::string::const_iterator get_data_elem_at_offset(
719
719
return end_of_string;
720
720
}
721
721
722
- static void l_dump_printf_buffer (cl_event event, cl_kernel kernel,
723
- unsigned size) {
722
+ static size_t l_dump_printf_buffer (cl_event event, cl_kernel kernel,
723
+ unsigned size) {
724
724
unsigned global_offset; // the location in the printf buffer
725
725
unsigned single_printf_offset; // the offset of a single printf
726
726
void (*hal_dma_fn)(cl_event, const void *, void *, size_t ) = 0 ;
727
727
int src_on_host = 1 ;
728
+ size_t dumped_buffer_size = 0 ;
728
729
#ifdef _WIN32
729
730
__declspec (align (64 )) char
730
731
buffer[ACL_PRINTF_BUFFER_TOTAL_SIZE]; // Aligned to 64, for dma transfers
@@ -743,11 +744,11 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
743
744
if (!verify_types ()) {
744
745
printf (" Host data types are incompatible with ACL compiler, ignoring "
745
746
" printfs...\n " );
746
- return ;
747
+ return dumped_buffer_size ;
747
748
}
748
749
749
750
if (printf_infos.empty ())
750
- return ;
751
+ return dumped_buffer_size ;
751
752
752
753
// Memory is on the device if all of these are true:
753
754
// The memory is not SVM or the device does not support SVM.
@@ -769,8 +770,28 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
769
770
hal_dma_fn = acl_get_hal ()->copy_globalmem_to_hostmem ;
770
771
}
771
772
772
- hal_dma_fn (NULL , kernel->printf_device_buffer ->block_allocation ->range .begin ,
773
- buffer, size);
773
+ // Gate this message on the HAL debug verbosity (ACL_HAL_DEBUG), not ACL_DEBUG
774
+ if (acl_get_hal ()->get_debug_verbosity &&
775
+ acl_get_hal ()->get_debug_verbosity () > 0 ) {
776
+ printf (" Previously processed buffer size is %zu \n " ,
777
+ kernel->processed_printf_buffer_size );
778
+ }
779
+
780
+ // Check whether the whole printf buffer has already been processed
781
+ if (size > (unsigned int )kernel->processed_printf_buffer_size ) {
782
+ void *unprocessed_begin = (void *)((char *)kernel->printf_device_buffer
783
+ ->block_allocation ->range .begin +
784
+ kernel->processed_printf_buffer_size );
785
+ assert (size >= kernel->processed_printf_buffer_size );
786
+ dumped_buffer_size = size - kernel->processed_printf_buffer_size ;
787
+ hal_dma_fn (NULL , unprocessed_begin, buffer, dumped_buffer_size);
788
+ } else {
789
+ if (acl_get_hal ()->get_debug_verbosity &&
790
+ acl_get_hal ()->get_debug_verbosity () > 0 ) {
791
+ printf (" All Printf() buffer has already been dumped \n " );
792
+ }
793
+ return dumped_buffer_size;
794
+ }
774
795
775
796
#ifdef DEBUG
776
797
if (debug_mode > 0 ) {
@@ -789,11 +810,11 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
789
810
}
790
811
}
791
812
#endif
792
-
793
813
// always 32-byte aligned address (this may change if printf chunks can be
794
814
// of different sizes )
795
815
// process all the printfs as long as there is data
796
- for (global_offset = 0 , single_printf_offset = 0 ; global_offset < size;
816
+ for (global_offset = 0 , single_printf_offset = 0 ;
817
+ global_offset < dumped_buffer_size;
797
818
global_offset += single_printf_offset) {
798
819
799
820
// the first 4-bytes is the index of the format string
@@ -820,7 +841,7 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
820
841
if (!success) {
821
842
acl_print_debug_msg (
822
843
" corrupt printf data, ignoring remaining printfs...\n " );
823
- return ;
844
+ return dumped_buffer_size ;
824
845
}
825
846
826
847
#ifdef DEBUG
@@ -883,7 +904,7 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
883
904
if (vector_size == -1 ) {
884
905
acl_print_debug_msg (" wrong vector specifier in printf call, ignoring "
885
906
" remaining printfs...\n " );
886
- return ;
907
+ return dumped_buffer_size ;
887
908
}
888
909
889
910
// get the length specifier
@@ -904,7 +925,7 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
904
925
if (size_of_data == 0 ) {
905
926
acl_print_debug_msg (" wrong length modifier in printf call, ignoring "
906
927
" remaining printfs...\n " );
907
- return ;
928
+ return dumped_buffer_size ;
908
929
}
909
930
910
931
for (i = 0 ; i < vector_size; i++) {
@@ -960,13 +981,14 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
960
981
#ifdef DEBUG
961
982
printf (" exiting acl_dump_buffer...\n " );
962
983
#endif
984
+ return dumped_buffer_size;
963
985
}
964
986
965
987
//
966
988
// Schedule enqueue read buffer to read printf buffer
967
989
// The activation ID is the device op ID.
968
990
void acl_schedule_printf_buffer_pickup (int activation_id, int size,
969
- int stalled ) {
991
+ int debug_dump_printf ) {
970
992
acl_device_op_queue_t *doq = &(acl_platform.device_op_queue );
971
993
972
994
// This function can potentially be called by a HAL that does not use the
@@ -980,17 +1002,17 @@ void acl_schedule_printf_buffer_pickup(int activation_id, int size,
980
1002
}
981
1003
982
1004
#ifdef DEBUG
983
- printf (" printf pickup %d %d %d \n " , activation_id, size, stalled );
1005
+ printf (" printf pickup %d %d\n " , activation_id, size);
984
1006
fflush (stdout);
985
1007
#endif
986
1008
if (activation_id >= 0 && activation_id < doq->max_ops ) {
987
1009
// This address is stable, given a fixed activation_id.
988
1010
// So we don't run into race conditions.
989
1011
acl_device_op_t *op = doq->op + activation_id;
990
-
991
- stalled = stalled; // this argument is no longer used!
992
-
993
1012
op->info .num_printf_bytes_pending = (cl_uint)size;
1013
+
1014
+ // Record on the device op whether this pickup is a debug dump
1015
+ op->info .debug_dump_printf = debug_dump_printf ? 1 : 0 ;
994
1016
}
995
1017
// Signal all waiters.
996
1018
acl_signal_device_update ();
@@ -1009,7 +1031,15 @@ void acl_process_printf_buffer(void *user_data, acl_device_op_t *op) {
1009
1031
1010
1032
// Grab the printf data and emit it.
1011
1033
cl_uint num_bytes = op->info .num_printf_bytes_pending ;
1012
- l_dump_printf_buffer (event, kernel, num_bytes);
1034
+ size_t dumped_buffer_size = l_dump_printf_buffer (event, kernel, num_bytes);
1035
+
1036
+ if (op->info .debug_dump_printf == 1 ) {
1037
+ // Update the already processed buffer size
1038
+ kernel->processed_printf_buffer_size += dumped_buffer_size;
1039
+ } else {
1040
+ // Full dump, reset this variable
1041
+ kernel->processed_printf_buffer_size = 0 ;
1042
+ }
1013
1043
1014
1044
// Mark this printf work as done. Must do this *before* unstalling
1015
1045
// the kernel, to avoid a race against the kernel filling up the
@@ -1021,8 +1051,13 @@ void acl_process_printf_buffer(void *user_data, acl_device_op_t *op) {
1021
1051
acl_memory_barrier ();
1022
1052
1023
1053
// Allow the kernel to continue running.
1024
- acl_get_hal ()->unstall_kernel (
1025
- event->cmd .info .ndrange_kernel .device ->def .physical_device_id , op->id );
1054
+ // The kernel does not need to be unstalled when this is an early flush
1055
+ // performed for debugging.
1056
+ if (op->info .debug_dump_printf == 0 ) {
1057
+ acl_get_hal ()->unstall_kernel (
1058
+ event->cmd .info .ndrange_kernel .device ->def .physical_device_id ,
1059
+ op->id );
1060
+ }
1026
1061
}
1027
1062
}
1028
1063
@@ -1084,4 +1119,4 @@ static void decode_string(std::string &print_data) {
1084
1119
1085
1120
#ifdef __GNUC__
1086
1121
#pragma GCC visibility pop
1087
- #endif
1122
+ #endif
0 commit comments