Skip to content

Support early print buffer flushing in debug mode #61

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion include/acl_hal.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ typedef void (*acl_device_update_callback)(
unsigned physical_device_id, CL_EXCEPTION_TYPE_INTEL exception_type,
void *user_private_info, size_t user_cb);
typedef void (*acl_process_printf_buffer_callback)(int activation_id, int size,
int stalled);
int debug_dump_printf);
///@}

typedef unsigned int mem_properties_t;
Expand Down
3 changes: 3 additions & 0 deletions include/acl_kernel_if.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ typedef struct {

// Track which of the kernels is the autorun profiling kernel (-1 if none)
int autorun_profiling_kernel_id;

// Track debug printf activity
time_ns last_printf_dump = 0;
} acl_kernel_if;

// *********************** Public functions **************************
Expand Down
8 changes: 8 additions & 0 deletions include/acl_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -922,6 +922,10 @@ typedef struct _cl_kernel {
// Eventually this should be an array of ACLDeviceBinaries similar
// to how cl_program contains an array of dev_prog.
const acl_device_binary_t *dev_bin;

// In ACL_HAL_DEBUG mode, printf buffer could be dumped before Kernel ends
// Therefore, we need to keep track of how much data has been processed.
size_t processed_printf_buffer_size;
} _cl_kernel;

ACL_DECLARE_CL_OBJECT_ALLOC_FUNCTIONS(cl_kernel);
Expand Down Expand Up @@ -1346,6 +1350,10 @@ typedef struct {
// it sees the CL_COMPLETE status for the first time, but it won't
// change after that.
cl_uint num_printf_bytes_pending;

// Indicate whether this operation is dumping printf buffer before the Kernel
// for debug purpose
int debug_dump_printf = 0;
} acl_device_op_info_t;

// An operation to be performed on a device.
Expand Down
3 changes: 3 additions & 0 deletions src/acl_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2419,6 +2419,9 @@ static int l_init_kernel(cl_kernel kernel, cl_program program,
// Check if there are printfs in the kernel
kernel->printf_device_buffer = 0; // Default is none.
kernel->printf_device_ptr = 0; // Default is none.
// Keep track of already processed buffer size
// It will be reset when the buffer is full and dumped.
kernel->processed_printf_buffer_size = 0;
if (!accel_def->printf_format_info.empty()) {
auto gmem_idx = static_cast<size_t>(
acl_get_default_memory(kernel->dev_bin->get_devdef()));
Expand Down
57 changes: 47 additions & 10 deletions src/acl_kernel_if.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1369,8 +1369,9 @@ void acl_kernel_if_update_status(acl_kernel_if *kern) {
":: Calling acl_process_printf_buffer_fn with "
"activation_id=%d and printf_size=%u.\n",
activation_id, printf_size);
// update status, which will dump the printf buffer
acl_process_printf_buffer_fn(activation_id, (int)printf_size, 1);
// update status, which will dump the printf buffer, set
// debug_dump_printf = 0
acl_process_printf_buffer_fn(activation_id, (int)printf_size, 0);

ACL_KERNEL_IF_DEBUG_MSG(
kern, ":: Accelerator %d new csr is %x.\n", k,
Expand Down Expand Up @@ -1432,7 +1433,6 @@ void acl_kernel_if_update_status(acl_kernel_if *kern) {
kern->accel_job_ids[k][next_queue_back] = -1;

#ifdef TEST_PROFILING_HARDWARE

// Test readback of fake profile data using the acl_hal_mmd function that
// would be called from the acl runtime.
ACL_KERNEL_IF_DEBUG_MSG(
Expand Down Expand Up @@ -1495,10 +1495,39 @@ void acl_kernel_if_update_status(acl_kernel_if *kern) {
}
}

void acl_kernel_if_debug_dump_printf(acl_kernel_if *kern, unsigned k) {
acl_assert_locked();
unsigned int printf_size = 0;
int activation_id;
unsigned int next_queue_back;

if (kern->accel_queue_back[k] == (int)kern->accel_invoc_queue_depth[k] - 1)
next_queue_back = 0;
else
next_queue_back = kern->accel_queue_back[k] + 1;

if (kern->accel_num_printfs[k] > 0) {
acl_kernel_cra_read(kern, k, KERNEL_OFFSET_PRINTF_BUFFER_SIZE,
&printf_size);
assert(printf_size <= ACL_PRINTF_BUFFER_TOTAL_SIZE);
ACL_KERNEL_IF_DEBUG_MSG(
kern, ":: Accelerator %d printf buffer size is %d.\n", k, printf_size);
activation_id = kern->accel_job_ids[k][next_queue_back];
ACL_KERNEL_IF_DEBUG_MSG(kern,
":: Calling acl_process_printf_buffer_fn with "
"activation_id=%d and printf_size=%u.\n",
activation_id, printf_size);

// set debug_dump_printf to 1
acl_process_printf_buffer_fn(activation_id, (int)printf_size, 1);
}
}

void acl_kernel_if_dump_status(acl_kernel_if *kern) {
int expect_kernel = 0;
unsigned k, i;
acl_assert_locked();

for (k = 0; k < kern->num_accel; ++k) {
for (i = 0; i < kern->accel_invoc_queue_depth[k]; ++i) {
if (kern->accel_job_ids[k][i] >= 0) {
Expand All @@ -1510,10 +1539,6 @@ void acl_kernel_if_dump_status(acl_kernel_if *kern) {
if (!expect_kernel)
return;

kern->io.printf("No kernel updates in approximately 10 seconds for device %u",
kern->physical_device_id);
kern->io.printf(" ... a kernel may be hung?\n");

for (k = 0; k < kern->num_accel; ++k) {
unsigned int csr;

Expand All @@ -1539,6 +1564,9 @@ void acl_kernel_if_dump_status(acl_kernel_if *kern) {
if (ACL_KERNEL_READ_BIT(csr, KERNEL_CSR_LMEM_INVALID_BANK))
kern->io.printf(" lm_bank_exception");

// Dump the printf buffer to stdout
acl_kernel_if_debug_dump_printf(kern, k);

unsigned buffered_kernel_invocation = 0;
for (i = 0; i < kern->accel_invoc_queue_depth[k]; ++i) {
unsigned int next_queue_back;
Expand Down Expand Up @@ -1576,14 +1604,23 @@ void acl_kernel_if_check_kernel_status(acl_kernel_if *kern) {
#endif
acl_assert_locked();

// Print kernel status if it hasn't done anything in a while
// If multiple thread calls this, only one will print every 10 seconds
if (kern->last_kern_update != 0 &&
(acl_kernel_if_get_time_us(kern) - kern->last_kern_update >
10 * 1000000)) {
kern->last_kern_update = acl_kernel_if_get_time_us(kern);
if (kern->io.debug_verbosity > 0)
kern->io.printf(
"No kernel updates in approximately 10 seconds for device %u",
kern->physical_device_id);
kern->io.printf(" ... a kernel may be hung?\n");
acl_kernel_if_dump_status(kern);
} else if (kern->io.debug_verbosity >= 3) {
// If ACL_HAL_DEBUG >= 3, the status will be printed even the server isn't
// hang every 10 seconds.
if (acl_kernel_if_get_time_us(kern) - kern->last_printf_dump >
10 * 1000000) {
kern->last_printf_dump = acl_kernel_if_get_time_us(kern);
acl_kernel_if_dump_status(kern);
}
}

#ifdef POLLING
Expand Down
79 changes: 57 additions & 22 deletions src/acl_printf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@
#pragma GCC visibility push(protected)
#endif

static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
unsigned size);
static size_t l_dump_printf_buffer(cl_event event, cl_kernel kernel,
unsigned size);
static void decode_string(std::string &print_data);

// stores to global memory are bound by smallest global memory bandwidth we
Expand Down Expand Up @@ -719,12 +719,13 @@ static std::string::const_iterator get_data_elem_at_offset(
return end_of_string;
}

static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
unsigned size) {
static size_t l_dump_printf_buffer(cl_event event, cl_kernel kernel,
unsigned size) {
unsigned global_offset; // the location in the printf buffer
unsigned single_printf_offset; // the offset of a single printf
void (*hal_dma_fn)(cl_event, const void *, void *, size_t) = 0;
int src_on_host = 1;
size_t dumped_buffer_size = 0;
#ifdef _WIN32
__declspec(align(64)) char
buffer[ACL_PRINTF_BUFFER_TOTAL_SIZE]; // Aligned to 64, for dma transfers
Expand All @@ -743,11 +744,11 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
if (!verify_types()) {
printf("Host data types are incompatible with ACL compiler, ignoring "
"printfs...\n");
return;
return dumped_buffer_size;
}

if (printf_infos.empty())
return;
return dumped_buffer_size;

// Memory is on the device if all of these are true:
// The memory is not SVM or the device does not support SVM.
Expand All @@ -769,8 +770,28 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
hal_dma_fn = acl_get_hal()->copy_globalmem_to_hostmem;
}

hal_dma_fn(NULL, kernel->printf_device_buffer->block_allocation->range.begin,
buffer, size);
// It needs the context from ACL_HAL_DEBUG instead of ACL_DEBUG
if (acl_get_hal()->get_debug_verbosity &&
acl_get_hal()->get_debug_verbosity() > 0) {
printf("Previously processed buffer size is %zu \n",
kernel->processed_printf_buffer_size);
}

// Check if we have already processed all the printf buffer
if (size > (unsigned int)kernel->processed_printf_buffer_size) {
void *unprocessed_begin = (void *)((char *)kernel->printf_device_buffer
->block_allocation->range.begin +
kernel->processed_printf_buffer_size);
assert(size >= kernel->processed_printf_buffer_size);
dumped_buffer_size = size - kernel->processed_printf_buffer_size;
hal_dma_fn(NULL, unprocessed_begin, buffer, dumped_buffer_size);
} else {
if (acl_get_hal()->get_debug_verbosity &&
acl_get_hal()->get_debug_verbosity() > 0) {
printf("All Printf() buffer has already been dumped \n");
}
return dumped_buffer_size;
}

#ifdef DEBUG
if (debug_mode > 0) {
Expand All @@ -789,11 +810,11 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
}
}
#endif

// always 32-byte aligned address (this may change if printf chunks can be
// of different sizes )
// process all the printfs as long as there is data
for (global_offset = 0, single_printf_offset = 0; global_offset < size;
for (global_offset = 0, single_printf_offset = 0;
global_offset < dumped_buffer_size;
global_offset += single_printf_offset) {

// the first 4-bytes is the index of the format string
Expand All @@ -820,7 +841,7 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
if (!success) {
acl_print_debug_msg(
"corrupt printf data, ignoring remaining printfs...\n");
return;
return dumped_buffer_size;
}

#ifdef DEBUG
Expand Down Expand Up @@ -883,7 +904,7 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
if (vector_size == -1) {
acl_print_debug_msg("wrong vector specifier in printf call, ignoring "
"remaining printfs...\n");
return;
return dumped_buffer_size;
}

// get the length specifier
Expand All @@ -904,7 +925,7 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
if (size_of_data == 0) {
acl_print_debug_msg("wrong length modifier in printf call, ignoring "
"remaining printfs...\n");
return;
return dumped_buffer_size;
}

for (i = 0; i < vector_size; i++) {
Expand Down Expand Up @@ -960,13 +981,14 @@ static void l_dump_printf_buffer(cl_event event, cl_kernel kernel,
#ifdef DEBUG
printf("exiting acl_dump_buffer...\n");
#endif
return dumped_buffer_size;
}

//
// Schedule enqueue read buffer to read printf buffer
// The activation ID is the device op ID.
void acl_schedule_printf_buffer_pickup(int activation_id, int size,
int stalled) {
int debug_dump_printf) {
acl_device_op_queue_t *doq = &(acl_platform.device_op_queue);

// This function can potentially be called by a HAL that does not use the
Expand All @@ -980,17 +1002,17 @@ void acl_schedule_printf_buffer_pickup(int activation_id, int size,
}

#ifdef DEBUG
printf("printf pickup %d %d %d\n", activation_id, size, stalled);
printf("printf pickup %d %d\n", activation_id, size);
fflush(stdout);
#endif
if (activation_id >= 0 && activation_id < doq->max_ops) {
// This address is stable, given a fixed activation_id.
// So we don't run into race conditions.
acl_device_op_t *op = doq->op + activation_id;

stalled = stalled; // this argument is no longer used!

op->info.num_printf_bytes_pending = (cl_uint)size;

// Propagate the operation info
op->info.debug_dump_printf = debug_dump_printf ? 1 : 0;
}
// Signal all waiters.
acl_signal_device_update();
Expand All @@ -1009,7 +1031,15 @@ void acl_process_printf_buffer(void *user_data, acl_device_op_t *op) {

// Grab the printf data and emit it.
cl_uint num_bytes = op->info.num_printf_bytes_pending;
l_dump_printf_buffer(event, kernel, num_bytes);
size_t dumped_buffer_size = l_dump_printf_buffer(event, kernel, num_bytes);

if (op->info.debug_dump_printf) {
// Update the already processed buffer size
kernel->processed_printf_buffer_size += dumped_buffer_size;
} else {
// Full dump, reset this variable
kernel->processed_printf_buffer_size = 0;
}

// Mark this printf work as done. Must do this *before* unstalling
// the kernel, to avoid a race against the kernel filling up the
Expand All @@ -1021,8 +1051,13 @@ void acl_process_printf_buffer(void *user_data, acl_device_op_t *op) {
acl_memory_barrier();

// Allow the kernel to continue running.
acl_get_hal()->unstall_kernel(
event->cmd.info.ndrange_kernel.device->def.physical_device_id, op->id);
// We don't need to unstall the kernel during the early flushing during
// debug.
if (!op->info.debug_dump_printf) {
acl_get_hal()->unstall_kernel(
event->cmd.info.ndrange_kernel.device->def.physical_device_id,
op->id);
}
}
}

Expand Down Expand Up @@ -1084,4 +1119,4 @@ static void decode_string(std::string &print_data) {

#ifdef __GNUC__
#pragma GCC visibility pop
#endif
#endif
5 changes: 5 additions & 0 deletions test/acl_device_op_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,7 @@ TEST(device_op, submit_action) {
CHECK(l_launch_kernel == the_call);
CHECK_EQUAL(0, unblocked);
CHECK_EQUAL(0, op->info.num_printf_bytes_pending);
CHECK_EQUAL(0, op->info.debug_dump_printf);
CHECK(prev < (latest = op->timestamp[CL_SUBMITTED]));
prev = latest;

Expand Down Expand Up @@ -400,6 +401,7 @@ TEST(device_op, submit_action) {
op->info.type = ACL_DEVICE_OP_KERNEL;
op->status = op->execution_status = CL_RUNNING; // Required
op->info.num_printf_bytes_pending = 1;
CHECK_EQUAL(0, op->info.debug_dump_printf);
acl_submit_device_op(&m_doq, op);
CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, submit_kind);
CHECK_EQUAL(op->info.type, submit_kind);
Expand All @@ -409,6 +411,7 @@ TEST(device_op, submit_action) {
CHECK_EQUAL(1, unblocked);
CHECK_EQUAL(0, op->info.num_printf_bytes_pending); // no longer marked as
// stalled on printf.
CHECK_EQUAL(0, op->info.debug_dump_printf);
// We didn't change the submit time.
CHECK_EQUAL(kernel_submit_time, op->timestamp[CL_SUBMITTED]);
prev = latest;
Expand All @@ -419,6 +422,7 @@ TEST(device_op, submit_action) {
op->info.type = ACL_DEVICE_OP_KERNEL;
op->status = op->execution_status = CL_COMPLETE; // Required
op->info.num_printf_bytes_pending = 1;
CHECK_EQUAL(0, op->info.debug_dump_printf);
acl_submit_device_op(&m_doq, op);
CHECK_EQUAL(ACL_DEVICE_OP_KERNEL, submit_kind);
CHECK_EQUAL(op->info.type, submit_kind);
Expand All @@ -428,6 +432,7 @@ TEST(device_op, submit_action) {
CHECK_EQUAL(1, unblocked);
CHECK_EQUAL(0, op->info.num_printf_bytes_pending); // no longer marked as
// stalled on printf.
CHECK_EQUAL(0, op->info.debug_dump_printf);
// We didn't change the submit time.
CHECK_EQUAL(kernel_submit_time, op->timestamp[CL_SUBMITTED]);
prev = latest;
Expand Down
Loading