Skip to content

Commit c62dfe7

Browse files
author
zibai.wang
committed
Support early print buffer flushing in debug mode
Added a flow to flush printf() buffer before the Kernel ends. This will be helpful for debugging a hang Kernel. The Runtime periodically polls from the device and process the printf buffer from the last processed point. If ACL_HAL_DEBUG = 1 and the Kernel is Hang, Kernel status (existing feature) and printf buffer will be flushed to stdout every 10 seconds (approximately). The thread will run if there are spare cycles. If ACL_HAL_DEBUG = 3, Kernel status and printf buffer will flushed every 5 seconds, even the Kernel isn't hung. The behavior of printf() is not altered in non-debug mode.
1 parent 92f392e commit c62dfe7

File tree

8 files changed

+170
-41
lines changed

8 files changed

+170
-41
lines changed

include/acl_hal.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ typedef void (*acl_device_update_callback)(
6363
unsigned physical_device_id, CL_EXCEPTION_TYPE_INTEL exception_type,
6464
void *user_private_info, size_t user_cb);
6565
typedef void (*acl_process_printf_buffer_callback)(int activation_id, int size,
66-
int stalled);
66+
int debug_dump_printf);
6767
///@}
6868

6969
typedef unsigned int mem_properties_t;

include/acl_kernel_if.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,9 @@ typedef struct {
5050

5151
// Track which of the kernels is the autorun profiling kernel (-1 if none)
5252
int autorun_profiling_kernel_id;
53+
54+
// Track debug printf activity
55+
time_ns last_printf_dump = 0;
5356
} acl_kernel_if;
5457

5558
// *********************** Public functions **************************

include/acl_types.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -922,6 +922,10 @@ typedef struct _cl_kernel {
922922
// Eventually this should be an array of ACLDeviceBinaries similar
923923
// to how cl_program contains an array of dev_prog.
924924
const acl_device_binary_t *dev_bin;
925+
926+
// In ACL_HAL_DEBUG mode, printf buffer could be dumped before Kernel ends
927+
// Therefore, we need to keep track of how much data has been processed.
928+
size_t processed_printf_buffer_size;
925929
} _cl_kernel;
926930

927931
ACL_DECLARE_CL_OBJECT_ALLOC_FUNCTIONS(cl_kernel);
@@ -1346,6 +1350,10 @@ typedef struct {
13461350
// it sees the CL_COMPLETE status for the first time, but it won't
13471351
// change after that.
13481352
cl_uint num_printf_bytes_pending;
1353+
1354+
// Indicate whether this operation is dumping printf buffer before the Kernel
1355+
// for debug purpose
1356+
int debug_dump_printf = 0;
13491357
} acl_device_op_info_t;
13501358

13511359
// An operation to be performed on a device.

src/acl_kernel.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2419,6 +2419,9 @@ static int l_init_kernel(cl_kernel kernel, cl_program program,
24192419
// Check if there are printfs in the kernel
24202420
kernel->printf_device_buffer = 0; // Default is none.
24212421
kernel->printf_device_ptr = 0; // Default is none.
2422+
// Keep track of already processed buffer size
2423+
// It will be reset when the buffer is full and dumped.
2424+
kernel->processed_printf_buffer_size = 0;
24222425
if (!accel_def->printf_format_info.empty()) {
24232426
auto gmem_idx = static_cast<size_t>(
24242427
acl_get_default_memory(kernel->dev_bin->get_devdef()));

src/acl_kernel_if.cpp

Lines changed: 47 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1369,8 +1369,9 @@ void acl_kernel_if_update_status(acl_kernel_if *kern) {
13691369
":: Calling acl_process_printf_buffer_fn with "
13701370
"activation_id=%d and printf_size=%u.\n",
13711371
activation_id, printf_size);
1372-
// update status, which will dump the printf buffer
1373-
acl_process_printf_buffer_fn(activation_id, (int)printf_size, 1);
1372+
// update status, which will dump the printf buffer, set
1373+
// debug_dump_printf = 0
1374+
acl_process_printf_buffer_fn(activation_id, (int)printf_size, 0);
13741375

13751376
ACL_KERNEL_IF_DEBUG_MSG(
13761377
kern, ":: Accelerator %d new csr is %x.\n", k,
@@ -1432,7 +1433,6 @@ void acl_kernel_if_update_status(acl_kernel_if *kern) {
14321433
kern->accel_job_ids[k][next_queue_back] = -1;
14331434

14341435
#ifdef TEST_PROFILING_HARDWARE
1435-
14361436
// Test readback of fake profile data using the acl_hal_mmd function that
14371437
// would be called from the acl runtime.
14381438
ACL_KERNEL_IF_DEBUG_MSG(
@@ -1495,10 +1495,39 @@ void acl_kernel_if_update_status(acl_kernel_if *kern) {
14951495
}
14961496
}
14971497

1498+
void acl_kernel_if_debug_dump_printf(acl_kernel_if *kern, unsigned k) {
1499+
acl_assert_locked();
1500+
unsigned int printf_size = 0;
1501+
int activation_id;
1502+
unsigned int next_queue_back;
1503+
1504+
if (kern->accel_queue_back[k] == (int)kern->accel_invoc_queue_depth[k] - 1)
1505+
next_queue_back = 0;
1506+
else
1507+
next_queue_back = kern->accel_queue_back[k] + 1;
1508+
1509+
if (kern->accel_num_printfs[k] > 0) {
1510+
acl_kernel_cra_read(kern, k, KERNEL_OFFSET_PRINTF_BUFFER_SIZE,
1511+
&printf_size);
1512+
assert(printf_size <= ACL_PRINTF_BUFFER_TOTAL_SIZE);
1513+
ACL_KERNEL_IF_DEBUG_MSG(
1514+
kern, ":: Accelerator %d printf buffer size is %d.\n", k, printf_size);
1515+
activation_id = kern->accel_job_ids[k][next_queue_back];
1516+
ACL_KERNEL_IF_DEBUG_MSG(kern,
1517+
":: Calling acl_process_printf_buffer_fn with "
1518+
"activation_id=%d and printf_size=%u.\n",
1519+
activation_id, printf_size);
1520+
1521+
// set debug_dump_printf to 1
1522+
acl_process_printf_buffer_fn(activation_id, (int)printf_size, 1);
1523+
}
1524+
}
1525+
14981526
void acl_kernel_if_dump_status(acl_kernel_if *kern) {
14991527
int expect_kernel = 0;
15001528
unsigned k, i;
15011529
acl_assert_locked();
1530+
15021531
for (k = 0; k < kern->num_accel; ++k) {
15031532
for (i = 0; i < kern->accel_invoc_queue_depth[k]; ++i) {
15041533
if (kern->accel_job_ids[k][i] >= 0) {
@@ -1510,10 +1539,6 @@ void acl_kernel_if_dump_status(acl_kernel_if *kern) {
15101539
if (!expect_kernel)
15111540
return;
15121541

1513-
kern->io.printf("No kernel updates in approximately 10 seconds for device %u",
1514-
kern->physical_device_id);
1515-
kern->io.printf(" ... a kernel may be hung?\n");
1516-
15171542
for (k = 0; k < kern->num_accel; ++k) {
15181543
unsigned int csr;
15191544

@@ -1539,6 +1564,9 @@ void acl_kernel_if_dump_status(acl_kernel_if *kern) {
15391564
if (ACL_KERNEL_READ_BIT(csr, KERNEL_CSR_LMEM_INVALID_BANK))
15401565
kern->io.printf(" lm_bank_exception");
15411566

1567+
// Dump the printf buffer to stdout
1568+
acl_kernel_if_debug_dump_printf(kern, k);
1569+
15421570
unsigned buffered_kernel_invocation = 0;
15431571
for (i = 0; i < kern->accel_invoc_queue_depth[k]; ++i) {
15441572
unsigned int next_queue_back;
@@ -1576,14 +1604,23 @@ void acl_kernel_if_check_kernel_status(acl_kernel_if *kern) {
15761604
#endif
15771605
acl_assert_locked();
15781606

1579-
// Print kernel status if it hasn't done anything in a while
1580-
// If multiple thread calls this, only one will print every 10 seconds
15811607
if (kern->last_kern_update != 0 &&
15821608
(acl_kernel_if_get_time_us(kern) - kern->last_kern_update >
15831609
10 * 1000000)) {
15841610
kern->last_kern_update = acl_kernel_if_get_time_us(kern);
1585-
if (kern->io.debug_verbosity > 0)
1611+
kern->io.printf(
1612+
"No kernel updates in approximately 10 seconds for device %u",
1613+
kern->physical_device_id);
1614+
kern->io.printf(" ... a kernel may be hung?\n");
1615+
acl_kernel_if_dump_status(kern);
1616+
} else if (kern->io.debug_verbosity >= 3) {
1617+
// If ACL_HAL_DEBUG >= 3, the status will be printed even the server isn't
1618+
// hang every 10 seconds.
1619+
if (acl_kernel_if_get_time_us(kern) - kern->last_printf_dump >
1620+
10 * 1000000) {
1621+
kern->last_printf_dump = acl_kernel_if_get_time_us(kern);
15861622
acl_kernel_if_dump_status(kern);
1623+
}
15871624
}
15881625

15891626
#ifdef POLLING

src/acl_printf.cpp

Lines changed: 57 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,8 @@
3838
#pragma GCC visibility push(protected)
3939
#endif
4040

41-
static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
42-
unsigned size);
41+
static size_t l_dump_printf_buffer(cl_event event, cl_kernel kernel,
42+
unsigned size);
4343
static void decode_string(std::string &print_data);
4444

4545
// stores to global memory are bound by smallest global memory bandwidth we
@@ -719,12 +719,13 @@ static std::string::const_iterator get_data_elem_at_offset(
719719
return end_of_string;
720720
}
721721

722-
static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
723-
unsigned size) {
722+
static size_t l_dump_printf_buffer(cl_event event, cl_kernel kernel,
723+
unsigned size) {
724724
unsigned global_offset; // the location in the printf buffer
725725
unsigned single_printf_offset; // the offset of a single printf
726726
void (*hal_dma_fn)(cl_event, const void *, void *, size_t) = 0;
727727
int src_on_host = 1;
728+
size_t dumped_buffer_size = 0;
728729
#ifdef _WIN32
729730
__declspec(align(64)) char
730731
buffer[ACL_PRINTF_BUFFER_TOTAL_SIZE]; // Aligned to 64, for dma transfers
@@ -743,11 +744,11 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
743744
if (!verify_types()) {
744745
printf("Host data types are incompatible with ACL compiler, ignoring "
745746
"printfs...\n");
746-
return;
747+
return dumped_buffer_size;
747748
}
748749

749750
if (printf_infos.empty())
750-
return;
751+
return dumped_buffer_size;
751752

752753
// Memory is on the device if all of these are true:
753754
// The memory is not SVM or the device does not support SVM.
@@ -769,8 +770,28 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
769770
hal_dma_fn = acl_get_hal()->copy_globalmem_to_hostmem;
770771
}
771772

772-
hal_dma_fn(NULL, kernel->printf_device_buffer->block_allocation->range.begin,
773-
buffer, size);
773+
// It needs the context from ACL_HAL_DEBUG instead of ACL_DEBUG
774+
if (acl_get_hal()->get_debug_verbosity &&
775+
acl_get_hal()->get_debug_verbosity() > 0) {
776+
printf("Previously processed buffer size is %zu \n",
777+
kernel->processed_printf_buffer_size);
778+
}
779+
780+
// Check if we have already processed all the printf buffer
781+
if (size > (unsigned int)kernel->processed_printf_buffer_size) {
782+
void *unprocessed_begin = (void *)((char *)kernel->printf_device_buffer
783+
->block_allocation->range.begin +
784+
kernel->processed_printf_buffer_size);
785+
assert(size >= kernel->processed_printf_buffer_size);
786+
dumped_buffer_size = size - kernel->processed_printf_buffer_size;
787+
hal_dma_fn(NULL, unprocessed_begin, buffer, dumped_buffer_size);
788+
} else {
789+
if (acl_get_hal()->get_debug_verbosity &&
790+
acl_get_hal()->get_debug_verbosity() > 0) {
791+
printf("All Printf() buffer has already been dumped \n");
792+
}
793+
return dumped_buffer_size;
794+
}
774795

775796
#ifdef DEBUG
776797
if (debug_mode > 0) {
@@ -789,11 +810,11 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
789810
}
790811
}
791812
#endif
792-
793813
// always 32-byte aligned address (this may change if printf chunks can be
794814
// of different sizes )
795815
// process all the printfs as long as there is data
796-
for (global_offset = 0, single_printf_offset = 0; global_offset < size;
816+
for (global_offset = 0, single_printf_offset = 0;
817+
global_offset < dumped_buffer_size;
797818
global_offset += single_printf_offset) {
798819

799820
// the first 4-bytes is the index of the format string
@@ -820,7 +841,7 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
820841
if (!success) {
821842
acl_print_debug_msg(
822843
"corrupt printf data, ignoring remaining printfs...\n");
823-
return;
844+
return dumped_buffer_size;
824845
}
825846

826847
#ifdef DEBUG
@@ -883,7 +904,7 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
883904
if (vector_size == -1) {
884905
acl_print_debug_msg("wrong vector specifier in printf call, ignoring "
885906
"remaining printfs...\n");
886-
return;
907+
return dumped_buffer_size;
887908
}
888909

889910
// get the length specifier
@@ -904,7 +925,7 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
904925
if (size_of_data == 0) {
905926
acl_print_debug_msg("wrong length modifier in printf call, ignoring "
906927
"remaining printfs...\n");
907-
return;
928+
return dumped_buffer_size;
908929
}
909930

910931
for (i = 0; i < vector_size; i++) {
@@ -960,13 +981,14 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
960981
#ifdef DEBUG
961982
printf("exiting acl_dump_buffer...\n");
962983
#endif
984+
return dumped_buffer_size;
963985
}
964986

965987
//
966988
// Schedule enqueue read buffer to read printf buffer
967989
// The activation ID is the device op ID.
968990
void acl_schedule_printf_buffer_pickup(int activation_id, int size,
969-
int stalled) {
991+
int debug_dump_printf) {
970992
acl_device_op_queue_t *doq = &(acl_platform.device_op_queue);
971993

972994
// This function can potentially be called by a HAL that does not use the
@@ -980,17 +1002,17 @@ void acl_schedule_printf_buffer_pickup(int activation_id, int size,
9801002
}
9811003

9821004
#ifdef DEBUG
983-
printf("printf pickup %d %d %d\n", activation_id, size, stalled);
1005+
printf("printf pickup %d %d\n", activation_id, size);
9841006
fflush(stdout);
9851007
#endif
9861008
if (activation_id >= 0 && activation_id < doq->max_ops) {
9871009
// This address is stable, given a fixed activation_id.
9881010
// So we don't run into race conditions.
9891011
acl_device_op_t *op = doq->op + activation_id;
990-
991-
stalled = stalled; // this argument is no longer used!
992-
9931012
op->info.num_printf_bytes_pending = (cl_uint)size;
1013+
1014+
// Propagate the operation info
1015+
op->info.debug_dump_printf = debug_dump_printf ? 1 : 0;
9941016
}
9951017
// Signal all waiters.
9961018
acl_signal_device_update();
@@ -1009,7 +1031,15 @@ void acl_process_printf_buffer(void *user_data, acl_device_op_t *op) {
10091031

10101032
// Grab the printf data and emit it.
10111033
cl_uint num_bytes = op->info.num_printf_bytes_pending;
1012-
l_dump_printf_buffer(event, kernel, num_bytes);
1034+
size_t dumped_buffer_size = l_dump_printf_buffer(event, kernel, num_bytes);
1035+
1036+
if (op->info.debug_dump_printf == 1) {
1037+
// Update the already processed buffer size
1038+
kernel->processed_printf_buffer_size += dumped_buffer_size;
1039+
} else {
1040+
// Full dump, reset this variable
1041+
kernel->processed_printf_buffer_size = 0;
1042+
}
10131043

10141044
// Mark this printf work as done. Must do this *before* unstalling
10151045
// the kernel, to avoid a race against the kernel filling up the
@@ -1021,8 +1051,13 @@ void acl_process_printf_buffer(void *user_data, acl_device_op_t *op) {
10211051
acl_memory_barrier();
10221052

10231053
// Allow the kernel to continue running.
1024-
acl_get_hal()->unstall_kernel(
1025-
event->cmd.info.ndrange_kernel.device->def.physical_device_id, op->id);
1054+
// We don't need to unstall the kernel during the early flushing during
1055+
// debug.
1056+
if (op->info.debug_dump_printf == 0) {
1057+
acl_get_hal()->unstall_kernel(
1058+
event->cmd.info.ndrange_kernel.device->def.physical_device_id,
1059+
op->id);
1060+
}
10261061
}
10271062
}
10281063

@@ -1084,4 +1119,4 @@ static void decode_string(std::string &print_data) {
10841119

10851120
#ifdef __GNUC__
10861121
#pragma GCC visibility pop
1087-
#endif
1122+
#endif

test/acl_device_op_test.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,7 @@ TEST(device_op, submit_action) {
363363
CHECK(l_launch_kernel == the_call);
364364
CHECK_EQUAL(0, unblocked);
365365
CHECK_EQUAL(0, op->info.num_printf_bytes_pending);
366+
CHECK_EQUAL(0, op->info.debug_dump_printf);
366367
CHECK(prev < (latest = op->timestamp[CL_SUBMITTED]));
367368
prev = latest;
368369

@@ -400,6 +401,7 @@ TEST(device_op, submit_action) {
400401
op->info.type = ACL_DEVICE_OP_KERNEL;
401402
op->status = op->execution_status = CL_RUNNING; // Required
402403
op->info.num_printf_bytes_pending = 1;
404+
CHECK_EQUAL(0, op->info.debug_dump_printf);
403405
acl_submit_device_op(&m_doq, op);
404406
CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, submit_kind);
405407
CHECK_EQUAL(op->info.type, submit_kind);
@@ -409,6 +411,7 @@ TEST(device_op, submit_action) {
409411
CHECK_EQUAL(1, unblocked);
410412
CHECK_EQUAL(0, op->info.num_printf_bytes_pending); // no longer marked as
411413
// stalled on printf.
414+
CHECK_EQUAL(0, op->info.debug_dump_printf);
412415
// We didn't change the submit time.
413416
CHECK_EQUAL(kernel_submit_time, op->timestamp[CL_SUBMITTED]);
414417
prev = latest;
@@ -419,6 +422,7 @@ TEST(device_op, submit_action) {
419422
op->info.type = ACL_DEVICE_OP_KERNEL;
420423
op->status = op->execution_status = CL_COMPLETE; // Required
421424
op->info.num_printf_bytes_pending = 1;
425+
CHECK_EQUAL(0, op->info.debug_dump_printf);
422426
acl_submit_device_op(&m_doq, op);
423427
CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, submit_kind);
424428
CHECK_EQUAL(op->info.type, submit_kind);
@@ -428,6 +432,7 @@ TEST(device_op, submit_action) {
428432
CHECK_EQUAL(1, unblocked);
429433
CHECK_EQUAL(0, op->info.num_printf_bytes_pending); // no longer marked as
430434
// stalled on printf.
435+
CHECK_EQUAL(0, op->info.debug_dump_printf);
431436
// We didn't change the submit time.
432437
CHECK_EQUAL(kernel_submit_time, op->timestamp[CL_SUBMITTED]);
433438
prev = latest;

0 commit comments

Comments
 (0)