Skip to content

Commit 50fac71

Browse files
author
zibai.wang
committed
Support early print buffer flushing in debug mode
1 parent 92f392e commit 50fac71

File tree

5 files changed

+122
-32
lines changed

5 files changed

+122
-32
lines changed

include/acl_kernel_if.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,9 @@ typedef struct {
5050

5151
// Track which of the kernels is the autorun profiling kernel (-1 if none)
5252
int autorun_profiling_kernel_id;
53+
54+
// Track debug printf activity
55+
time_ns last_printf_dump = 0;
5356
} acl_kernel_if;
5457

5558
// *********************** Public functions **************************

include/acl_types.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -922,6 +922,10 @@ typedef struct _cl_kernel {
922922
// Eventually this should be an array of ACLDeviceBinaries similar
923923
// to how cl_program contains an array of dev_prog.
924924
const acl_device_binary_t *dev_bin;
925+
926+
// In ACL_HAL_DEBUG mode, printf buffer could be dumped before Kernel ends
927+
// Therefore, we need to keep track of how much data has been processed.
928+
size_t processed_printf_buffer_size;
925929
} _cl_kernel;
926930

927931
ACL_DECLARE_CL_OBJECT_ALLOC_FUNCTIONS(cl_kernel);
@@ -1346,6 +1350,10 @@ typedef struct {
13461350
// it sees the CL_COMPLETE status for the first time, but it won't
13471351
// change after that.
13481352
cl_uint num_printf_bytes_pending;
1353+
1354+
// Indicate whether this operation is dumping printf buffer before the Kernel
1355+
// for debug purpose
1356+
int debug_dump_printf = 0;
13491357
} acl_device_op_info_t;
13501358

13511359
// An operation to be performed on a device.

src/acl_kernel.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2419,6 +2419,9 @@ static int l_init_kernel(cl_kernel kernel, cl_program program,
24192419
// Check if there are printfs in the kernel
24202420
kernel->printf_device_buffer = 0; // Default is none.
24212421
kernel->printf_device_ptr = 0; // Default is none.
2422+
// Keep track of already processed buffer size
2423+
// It will be reset when the buffer is full and dumped.
2424+
kernel->processed_printf_buffer_size = 0;
24222425
if (!accel_def->printf_format_info.empty()) {
24232426
auto gmem_idx = static_cast<size_t>(
24242427
acl_get_default_memory(kernel->dev_bin->get_devdef()));

src/acl_kernel_if.cpp

Lines changed: 47 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1369,8 +1369,9 @@ void acl_kernel_if_update_status(acl_kernel_if *kern) {
13691369
":: Calling acl_process_printf_buffer_fn with "
13701370
"activation_id=%d and printf_size=%u.\n",
13711371
activation_id, printf_size);
1372-
// update status, which will dump the printf buffer
1373-
acl_process_printf_buffer_fn(activation_id, (int)printf_size, 1);
1372+
// update status, which will dump the printf buffer, set
1373+
// debug_dump_printf = 0
1374+
acl_process_printf_buffer_fn(activation_id, (int)printf_size, 0);
13741375

13751376
ACL_KERNEL_IF_DEBUG_MSG(
13761377
kern, ":: Accelerator %d new csr is %x.\n", k,
@@ -1432,7 +1433,6 @@ void acl_kernel_if_update_status(acl_kernel_if *kern) {
14321433
kern->accel_job_ids[k][next_queue_back] = -1;
14331434

14341435
#ifdef TEST_PROFILING_HARDWARE
1435-
14361436
// Test readback of fake profile data using the acl_hal_mmd function that
14371437
// would be called from the acl runtime.
14381438
ACL_KERNEL_IF_DEBUG_MSG(
@@ -1495,10 +1495,39 @@ void acl_kernel_if_update_status(acl_kernel_if *kern) {
14951495
}
14961496
}
14971497

1498+
void acl_kernel_if_debug_dump_printf(acl_kernel_if *kern, unsigned k) {
1499+
acl_assert_locked();
1500+
unsigned int printf_size = 0;
1501+
int activation_id;
1502+
unsigned int next_queue_back;
1503+
1504+
if (kern->accel_queue_back[k] == (int)kern->accel_invoc_queue_depth[k] - 1)
1505+
next_queue_back = 0;
1506+
else
1507+
next_queue_back = kern->accel_queue_back[k] + 1;
1508+
1509+
if (kern->accel_num_printfs[k] > 0) {
1510+
acl_kernel_cra_read(kern, k, KERNEL_OFFSET_PRINTF_BUFFER_SIZE,
1511+
&printf_size);
1512+
assert(printf_size <= ACL_PRINTF_BUFFER_TOTAL_SIZE);
1513+
ACL_KERNEL_IF_DEBUG_MSG(
1514+
kern, ":: Accelerator %d printf buffer size is %d.\n", k, printf_size);
1515+
activation_id = kern->accel_job_ids[k][next_queue_back];
1516+
ACL_KERNEL_IF_DEBUG_MSG(kern,
1517+
":: Calling acl_process_printf_buffer_fn with "
1518+
"activation_id=%d and printf_size=%u.\n",
1519+
activation_id, printf_size);
1520+
1521+
// set debug_dump_printf to 1
1522+
acl_process_printf_buffer_fn(activation_id, (int)printf_size, 1);
1523+
}
1524+
}
1525+
14981526
void acl_kernel_if_dump_status(acl_kernel_if *kern) {
14991527
int expect_kernel = 0;
15001528
unsigned k, i;
15011529
acl_assert_locked();
1530+
15021531
for (k = 0; k < kern->num_accel; ++k) {
15031532
for (i = 0; i < kern->accel_invoc_queue_depth[k]; ++i) {
15041533
if (kern->accel_job_ids[k][i] >= 0) {
@@ -1510,10 +1539,6 @@ void acl_kernel_if_dump_status(acl_kernel_if *kern) {
15101539
if (!expect_kernel)
15111540
return;
15121541

1513-
kern->io.printf("No kernel updates in approximately 10 seconds for device %u",
1514-
kern->physical_device_id);
1515-
kern->io.printf(" ... a kernel may be hung?\n");
1516-
15171542
for (k = 0; k < kern->num_accel; ++k) {
15181543
unsigned int csr;
15191544

@@ -1539,6 +1564,9 @@ void acl_kernel_if_dump_status(acl_kernel_if *kern) {
15391564
if (ACL_KERNEL_READ_BIT(csr, KERNEL_CSR_LMEM_INVALID_BANK))
15401565
kern->io.printf(" lm_bank_exception");
15411566

1567+
// Dump the printf buffer to stdout
1568+
acl_kernel_if_debug_dump_printf(kern, k);
1569+
15421570
unsigned buffered_kernel_invocation = 0;
15431571
for (i = 0; i < kern->accel_invoc_queue_depth[k]; ++i) {
15441572
unsigned int next_queue_back;
@@ -1576,14 +1604,23 @@ void acl_kernel_if_check_kernel_status(acl_kernel_if *kern) {
15761604
#endif
15771605
acl_assert_locked();
15781606

1579-
// Print kernel status if it hasn't done anything in a while
1580-
// If multiple thread calls this, only one will print every 10 seconds
15811607
if (kern->last_kern_update != 0 &&
15821608
(acl_kernel_if_get_time_us(kern) - kern->last_kern_update >
15831609
10 * 1000000)) {
15841610
kern->last_kern_update = acl_kernel_if_get_time_us(kern);
1585-
if (kern->io.debug_verbosity > 0)
1611+
kern->io.printf(
1612+
"No kernel updates in approximately 10 seconds for device %u",
1613+
kern->physical_device_id);
1614+
kern->io.printf(" ... a kernel may be hung?\n");
1615+
acl_kernel_if_dump_status(kern);
1616+
} else if (kern->io.debug_verbosity >= 3) {
1617+
// If ACL_HAL_DEBUG >= 3, the status will be printed even the server isn't
1618+
// hang every 10 seconds.
1619+
if (acl_kernel_if_get_time_us(kern) - kern->last_printf_dump >
1620+
10 * 1000000) {
1621+
kern->last_printf_dump = acl_kernel_if_get_time_us(kern);
15861622
acl_kernel_if_dump_status(kern);
1623+
}
15871624
}
15881625

15891626
#ifdef POLLING

src/acl_printf.cpp

Lines changed: 61 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,8 @@
3838
#pragma GCC visibility push(protected)
3939
#endif
4040

41-
static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
42-
unsigned size);
41+
static size_t l_dump_printf_buffer(cl_event event, cl_kernel kernel,
42+
unsigned size);
4343
static void decode_string(std::string &print_data);
4444

4545
// stores to global memory are bound by smallest global memory bandwidth we
@@ -719,12 +719,13 @@ static std::string::const_iterator get_data_elem_at_offset(
719719
return end_of_string;
720720
}
721721

722-
static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
723-
unsigned size) {
722+
static size_t l_dump_printf_buffer(cl_event event, cl_kernel kernel,
723+
unsigned size) {
724724
unsigned global_offset; // the location in the printf buffer
725725
unsigned single_printf_offset; // the offset of a single printf
726726
void (*hal_dma_fn)(cl_event, const void *, void *, size_t) = 0;
727727
int src_on_host = 1;
728+
size_t dumped_buffer_size = 0;
728729
#ifdef _WIN32
729730
__declspec(align(64)) char
730731
buffer[ACL_PRINTF_BUFFER_TOTAL_SIZE]; // Aligned to 64, for dma transfers
@@ -743,11 +744,11 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
743744
if (!verify_types()) {
744745
printf("Host data types are incompatible with ACL compiler, ignoring "
745746
"printfs...\n");
746-
return;
747+
return dumped_buffer_size;
747748
}
748749

749750
if (printf_infos.empty())
750-
return;
751+
return dumped_buffer_size;
751752

752753
// Memory is on the device if all of these are true:
753754
// The memory is not SVM or the device does not support SVM.
@@ -769,8 +770,28 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
769770
hal_dma_fn = acl_get_hal()->copy_globalmem_to_hostmem;
770771
}
771772

772-
hal_dma_fn(NULL, kernel->printf_device_buffer->block_allocation->range.begin,
773-
buffer, size);
773+
// It needs the context from ACL_HAL_DEBUG instead of ACL_DEBUG
774+
if (acl_get_hal()->get_debug_verbosity &&
775+
acl_get_hal()->get_debug_verbosity() > 0) {
776+
printf("Previously processed buffer size is %zu \n",
777+
kernel->processed_printf_buffer_size);
778+
}
779+
780+
// Check if we have already processed all the printf buffer
781+
if (size > (unsigned int)kernel->processed_printf_buffer_size) {
782+
void *unprocessed_begin = (void *)((char *)kernel->printf_device_buffer
783+
->block_allocation->range.begin +
784+
kernel->processed_printf_buffer_size);
785+
hal_dma_fn(NULL, unprocessed_begin, buffer,
786+
size - kernel->processed_printf_buffer_size);
787+
dumped_buffer_size = size - kernel->processed_printf_buffer_size;
788+
} else {
789+
if (acl_get_hal()->get_debug_verbosity &&
790+
acl_get_hal()->get_debug_verbosity() > 0) {
791+
printf("All Printf() buffer has already been dumped \n");
792+
}
793+
return dumped_buffer_size;
794+
}
774795

775796
#ifdef DEBUG
776797
if (debug_mode > 0) {
@@ -789,11 +810,11 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
789810
}
790811
}
791812
#endif
792-
793813
// always 32-byte aligned address (this may change if printf chunks can be
794814
// of different sizes )
795815
// process all the printfs as long as there is data
796-
for (global_offset = 0, single_printf_offset = 0; global_offset < size;
816+
for (global_offset = 0, single_printf_offset = 0;
817+
global_offset < dumped_buffer_size;
797818
global_offset += single_printf_offset) {
798819

799820
// the first 4-bytes is the index of the format string
@@ -820,7 +841,7 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
820841
if (!success) {
821842
acl_print_debug_msg(
822843
"corrupt printf data, ignoring remaining printfs...\n");
823-
return;
844+
return dumped_buffer_size;
824845
}
825846

826847
#ifdef DEBUG
@@ -883,7 +904,7 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
883904
if (vector_size == -1) {
884905
acl_print_debug_msg("wrong vector specifier in printf call, ignoring "
885906
"remaining printfs...\n");
886-
return;
907+
return dumped_buffer_size;
887908
}
888909

889910
// get the length specifier
@@ -904,7 +925,7 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
904925
if (size_of_data == 0) {
905926
acl_print_debug_msg("wrong length modifier in printf call, ignoring "
906927
"remaining printfs...\n");
907-
return;
928+
return dumped_buffer_size;
908929
}
909930

910931
for (i = 0; i < vector_size; i++) {
@@ -960,13 +981,14 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
960981
#ifdef DEBUG
961982
printf("exiting acl_dump_buffer...\n");
962983
#endif
984+
return dumped_buffer_size;
963985
}
964986

965987
//
966988
// Schedule enqueue read buffer to read printf buffer
967989
// The activation ID is the device op ID.
968990
void acl_schedule_printf_buffer_pickup(int activation_id, int size,
969-
int stalled) {
991+
int debug_dump_printf) {
970992
acl_device_op_queue_t *doq = &(acl_platform.device_op_queue);
971993

972994
// This function can potentially be called by a HAL that does not use the
@@ -980,17 +1002,21 @@ void acl_schedule_printf_buffer_pickup(int activation_id, int size,
9801002
}
9811003

9821004
#ifdef DEBUG
983-
printf("printf pickup %d %d %d\n", activation_id, size, stalled);
1005+
printf("printf pickup %d %d\n", activation_id, size);
9841006
fflush(stdout);
9851007
#endif
9861008
if (activation_id >= 0 && activation_id < doq->max_ops) {
9871009
// This address is stable, given a fixed activation_id.
9881010
// So we don't run into race conditions.
9891011
acl_device_op_t *op = doq->op + activation_id;
990-
991-
stalled = stalled; // this argument is no longer used!
992-
9931012
op->info.num_printf_bytes_pending = (cl_uint)size;
1013+
1014+
// Propagate the operation info
1015+
if (debug_dump_printf == 1) {
1016+
op->info.debug_dump_printf = 1;
1017+
} else {
1018+
op->info.debug_dump_printf = 0;
1019+
}
9941020
}
9951021
// Signal all waiters.
9961022
acl_signal_device_update();
@@ -1009,7 +1035,15 @@ void acl_process_printf_buffer(void *user_data, acl_device_op_t *op) {
10091035

10101036
// Grab the printf data and emit it.
10111037
cl_uint num_bytes = op->info.num_printf_bytes_pending;
1012-
l_dump_printf_buffer(event, kernel, num_bytes);
1038+
size_t dumped_buffer_size = l_dump_printf_buffer(event, kernel, num_bytes);
1039+
1040+
if (op->info.debug_dump_printf == 1) {
1041+
// Update the already processed buffer size
1042+
kernel->processed_printf_buffer_size += dumped_buffer_size;
1043+
} else {
1044+
// Full dump, reset this variable
1045+
kernel->processed_printf_buffer_size = 0;
1046+
}
10131047

10141048
// Mark this printf work as done. Must do this *before* unstalling
10151049
// the kernel, to avoid a race against the kernel filling up the
@@ -1021,8 +1055,13 @@ void acl_process_printf_buffer(void *user_data, acl_device_op_t *op) {
10211055
acl_memory_barrier();
10221056

10231057
// Allow the kernel to continue running.
1024-
acl_get_hal()->unstall_kernel(
1025-
event->cmd.info.ndrange_kernel.device->def.physical_device_id, op->id);
1058+
// We don't need to unstall the kernel during the early flushing during
1059+
// debug.
1060+
if (op->info.debug_dump_printf == 0) {
1061+
acl_get_hal()->unstall_kernel(
1062+
event->cmd.info.ndrange_kernel.device->def.physical_device_id,
1063+
op->id);
1064+
}
10261065
}
10271066
}
10281067

@@ -1084,4 +1123,4 @@ static void decode_string(std::string &print_data) {
10841123

10851124
#ifdef __GNUC__
10861125
#pragma GCC visibility pop
1087-
#endif
1126+
#endif

0 commit comments

Comments
 (0)