Skip to content

First set of run-time scalability improvements to scale gracefully to 8 and more FPGAs per single host program #137

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions include/acl_hal.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,12 @@ struct acl_pkg_file;
/// @name Callback type declarations
///@{
typedef void (*acl_event_update_callback)(cl_event event, int new_status);
typedef void (*acl_kernel_update_callback)(int activation_id, cl_int status);
typedef void (*acl_profile_callback)(int activation_id);
typedef void (*acl_kernel_update_callback)(unsigned int physical_device_id, int activation_id, cl_int status);
typedef void (*acl_profile_callback)(unsigned int physical_device_id, int activation_id);
typedef void (*acl_device_update_callback)(
unsigned physical_device_id, CL_EXCEPTION_TYPE_INTEL exception_type,
void *user_private_info, size_t user_cb);
typedef void (*acl_process_printf_buffer_callback)(int activation_id, int size,
typedef void (*acl_process_printf_buffer_callback)(unsigned int physical_device_id, int activation_id, int size,
int debug_dump_printf);
///@}

Expand Down
4 changes: 2 additions & 2 deletions include/acl_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,12 @@ void acl_launch_kernel(void *user_data, acl_device_op_t *op);

// Called when we get a kernel interrupt indicating that profiling data is ready
ACL_EXPORT
void acl_profile_update(int activation_id);
void acl_profile_update(unsigned int physical_device_id, int activation_id);

// This should be called by the HAL, to receive notification of RUNNING and
// COMPLETE state transitions, and used printf buffer size
ACL_EXPORT
void acl_receive_kernel_update(int activation_id, cl_int status);
void acl_receive_kernel_update(unsigned int physical_device_id, int activation_id, cl_int status);

// Used to check if one of the kernel arguments needs to be mapped to the device
// When unmapping subbuffers we may transfer memory that is currently used
Expand Down
5 changes: 5 additions & 0 deletions include/acl_platform.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ void acl_init_platform(void);
void acl_finalize_init_platform(unsigned int num_devices,
const cl_device_id *devices);
const char *acl_platform_extensions(void);
acl_device_op_queue_t *get_device_op_queue(unsigned int physical_device_id);
acl_device_op_queue_t *get_device_op_queue_from_context(cl_context context);

acl_locking_data_t *get_device_op_queue_locking_data(cl_device_id device);
acl_locking_data_t *get_device_op_queue_locking_data_from_context(cl_context context);

#if defined(__cplusplus)
} /* extern "C" */
Expand Down
2 changes: 1 addition & 1 deletion include/acl_printf.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ extern "C" {

// Enqueue printf buffer dump
ACL_EXPORT
void acl_schedule_printf_buffer_pickup(int activation_id, int size,
void acl_schedule_printf_buffer_pickup(unsigned int physical_device_id, int activation_id, int size,
int overflow);

// Print the printf data associated with the given device operation
Expand Down
111 changes: 82 additions & 29 deletions include/acl_thread.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,18 @@
#ifndef ACL_THREAD_H
#define ACL_THREAD_H

#include "acl.h"
#include "acl_context.h"
#include "acl_types.h"

// System headers.
#include <assert.h>
#include <stdarg.h>
#include <stdio.h>

// External library headers.
#include <acl_threadsupport/acl_threadsupport.h>

// Internal headers.
#include "acl.h"


#if defined(__cplusplus)
extern "C" {
#endif
Expand All @@ -23,6 +27,22 @@ extern "C" {
#define ACL_TLS __declspec(thread)
#endif


/* An opaque type for critical section + condition variable.
* Use indirection here so we don't force every module in the world to pull
* in windows.h.
*/
// typedef struct acl_condvar_s *acl_condvar_t;

/* Lock bookkeeping bundled together with the condition variable it goes with,
 * so that a lock can be handed around as an object. The helpers in this
 * header accept a pointer to one of these and fall back to the file-scope
 * acl_global_lock_count / acl_inside_sig_* variables when given nullptr.
 *
 * NOTE(review): the file-scope counterparts are declared ACL_TLS, whereas
 * these members are ordinary fields of whatever object embeds the struct --
 * confirm that sharing the counters between threads is intended.
 */
typedef struct acl_locking_data_s {
  // Held by value, so the full definition of acl_condvar_s must be visible.
  struct acl_condvar_s condvar;
  // acl_is_locked() reports "locked" while this is greater than zero.
  int lock_count;
  // Set to 1 by acl_sig_started() and back to 0 by acl_sig_finished().
  int inside_sig_flag;
  // lock_count as saved by acl_sig_started(); acl_sig_finished() restores it.
  int inside_sig_old_lock_count;
} acl_locking_data_t;


extern ACL_TLS int acl_global_lock_count;
extern ACL_TLS int acl_inside_sig_flag;
extern ACL_TLS int acl_inside_sig_old_lock_count;
Expand All @@ -38,23 +58,46 @@ extern ACL_TLS int acl_inside_sig_old_lock_count;
// If a function needs an assert that passes if either the lock is held or
// inside a signal handler, it can use "acl_assert_locked_or_sig()".

static inline int acl_is_inside_sig() { return acl_inside_sig_flag; }
// Returns the "inside signal handler" flag: the one stored in *locking_data
// when a locking object is supplied, otherwise the file-scope
// acl_inside_sig_flag.
static inline int acl_is_inside_sig(acl_locking_data_t *locking_data = nullptr) {
  return (locking_data != nullptr) ? locking_data->inside_sig_flag
                                   : acl_inside_sig_flag;
}

static inline void acl_assert_inside_sig() { assert(acl_is_inside_sig()); }
// Debug check (standard assert()) that the "inside signal handler" flag is
// set. With locking_data == nullptr the file-scope flag is tested, otherwise
// the flag stored in *locking_data.
static inline void acl_assert_inside_sig(acl_locking_data_t *locking_data = nullptr) {
assert(acl_is_inside_sig(locking_data));
}

static inline void acl_assert_outside_sig() { assert(!acl_is_inside_sig()); }
// Debug check (standard assert()) that the "inside signal handler" flag is
// clear. With locking_data == nullptr the file-scope flag is tested,
// otherwise the flag stored in *locking_data.
static inline void acl_assert_outside_sig(acl_locking_data_t *locking_data = nullptr) {
assert(!acl_is_inside_sig(locking_data));
}

static inline void acl_sig_started() {
assert(!acl_inside_sig_flag);
acl_inside_sig_flag = 1;
acl_inside_sig_old_lock_count = acl_global_lock_count;
acl_global_lock_count = 0;
// Marks entry into signal-handler mode. Asserts the flag is not already set,
// sets it, stashes the current lock count and then resets the lock count to
// zero. Operates on *locking_data when one is supplied, otherwise on the
// file-scope acl_inside_sig_flag / acl_global_lock_count variables.
static inline void acl_sig_started(acl_locking_data_t *locking_data = nullptr) {
  if (locking_data != nullptr) {
    // Per-object bookkeeping.
    assert(!locking_data->inside_sig_flag);
    locking_data->inside_sig_flag = 1;
    locking_data->inside_sig_old_lock_count = locking_data->lock_count;
    locking_data->lock_count = 0;
    return;
  }

  // No locking object given: use the file-scope state.
  assert(!acl_inside_sig_flag);
  acl_inside_sig_flag = 1;
  acl_inside_sig_old_lock_count = acl_global_lock_count;
  acl_global_lock_count = 0;
}

static inline void acl_sig_finished() {
assert(acl_inside_sig_flag);
acl_inside_sig_flag = 0;
acl_global_lock_count = acl_inside_sig_old_lock_count;
// Marks exit from signal-handler mode: asserts the flag is set, clears it and
// puts back the lock count that acl_sig_started() stashed. Operates on
// *locking_data when one is supplied, otherwise on the file-scope variables.
static inline void acl_sig_finished(acl_locking_data_t *locking_data = nullptr) {
  if (locking_data != nullptr) {
    // Per-object bookkeeping.
    assert(locking_data->inside_sig_flag);
    locking_data->inside_sig_flag = 0;
    locking_data->lock_count = locking_data->inside_sig_old_lock_count;
    return;
  }

  // No locking object given: use the file-scope state.
  assert(acl_inside_sig_flag);
  acl_inside_sig_flag = 0;
  acl_global_lock_count = acl_inside_sig_old_lock_count;
}

// Blocking/Unblocking signals (Only implemented for Linux)
Expand All @@ -75,31 +118,41 @@ static inline void acl_sig_unblock_signals() {

// -- global lock functions --

void acl_lock();
void acl_unlock();
int acl_suspend_lock();
void acl_resume_lock(int lock_count);
void acl_lock(acl_locking_data_t *locking_data = nullptr);
void acl_unlock(acl_locking_data_t *locking_data = nullptr);
int acl_suspend_lock(acl_locking_data_t *locking_data = nullptr);
void acl_resume_lock(int lock_count, acl_locking_data_t *locking_data = nullptr);
void acl_wait_for_device_update(cl_context context);
void acl_signal_device_update();

static inline int acl_is_locked() { return (acl_global_lock_count > 0); }
void acl_signal_device_update(acl_locking_data_t *locking_data = nullptr);

// Returns non-zero when the relevant lock count is positive. The count comes
// from *locking_data when a locking object is supplied, otherwise from the
// file-scope acl_global_lock_count.
static inline int acl_is_locked(acl_locking_data_t *locking_data = nullptr) {
  const int held_count = (locking_data != nullptr) ? locking_data->lock_count
                                                   : acl_global_lock_count;
  return held_count > 0;
}

// Used by dynamically loaded libs to check lock status.
int acl_is_locked_callback(void);
int acl_is_locked_callback(acl_locking_data_t *locking_data = nullptr);

static inline void acl_assert_locked() { assert(acl_is_locked()); }
// Debug check (standard assert()) that acl_is_locked() is true for the given
// locking object, or for the file-scope lock count when locking_data is
// nullptr.
static inline void acl_assert_locked(acl_locking_data_t *locking_data = nullptr) {
assert(acl_is_locked(locking_data));
}

static inline void acl_assert_locked_or_sig() {
assert(acl_is_locked() || acl_is_inside_sig());
// Debug check (standard assert()) that passes when either the lock is held
// or the "inside signal handler" flag is set. Both conditions are evaluated
// against the same source: *locking_data when supplied, otherwise the
// file-scope variables.
static inline void acl_assert_locked_or_sig(acl_locking_data_t *locking_data = nullptr) {
assert(acl_is_locked(locking_data) || acl_is_inside_sig(locking_data));
}

static inline void acl_assert_unlocked() { assert(!acl_is_locked()); }
// Debug check (standard assert()) that acl_is_locked() is false for the given
// locking object, or for the file-scope lock count when locking_data is
// nullptr.
static inline void acl_assert_unlocked(acl_locking_data_t *locking_data = nullptr) {
assert(!acl_is_locked(locking_data));
}

// -- misc functions --

int acl_get_thread_id();
int acl_get_pid();
void acl_yield_lock_and_thread();
void acl_yield_lock_and_thread(acl_locking_data_t *locking_data = nullptr);

#if defined(__cplusplus)
} /* extern "C" */
Expand Down
33 changes: 26 additions & 7 deletions include/acl_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "acl_device_binary.h"
#include "acl_hal.h"
#include "acl_icd_dispatch.h"
#include "acl_thread.h"

#if defined(__cplusplus)
extern "C" {
Expand Down Expand Up @@ -235,12 +236,6 @@ typedef enum {
*/
#define CL_CONTEXT_COMPILE_COMMAND_INTELFPGA ACL_EXPERIMENTAL_ENUM(1)

/* An opaque type for critical section + condition variable.
* Use indirection here so we don't force every module in the world to pull
* in windows.h.
*/
typedef struct acl_condvar_s *acl_condvar_t;

typedef enum {
ACL_INVALID_EXECUTION_TRANSITION = -1,
ACL_INVALID_EXECUTION_STATUS = -2,
Expand Down Expand Up @@ -981,6 +976,7 @@ typedef struct _cl_context {
cl_uint refcount;
acl_compiler_mode_t compiler_mode;


// Is this context in the middle of being freed?
// Fix re-entrancy of clReleaseContext.
int is_being_freed;
Expand Down Expand Up @@ -1524,13 +1520,21 @@ typedef struct acl_device_op_queue_t {

acl_device_op_stats_t stats;

// per-context condition variable for finer-grained locking.
// Only operations on devices in the current context are protected
// by this condvar.
acl_locking_data_t locking_data;

// The operations themselves.
acl_device_op_t op[ACL_MAX_DEVICE_OPS];

// Used to cache the devices; indexed by physical_id
// Used for checking if the device has concurrent read/write support
acl_device_def_t *devices[ACL_MAX_DEVICE];

// Number of physical devices managed by this queue
int num_managed_devices;

// These function pointers must be set to the actions to be taken when
// kicking off various device activities.
void (*launch_kernel)(void *, acl_device_op_t *);
Expand Down Expand Up @@ -1633,7 +1637,22 @@ typedef struct _cl_platform_id

// The device operation queue.
// These are the operations that can run immediately on the device.
acl_device_op_queue_t device_op_queue;

// Map from physical device id to device op queue that this device belongs
// to. All devices in a single context belong to the same device op queue.
// If multiple contexts share even a single device, all devices in all these
// contexts share a single device op queue. Only if multiple contexts do not
// share even a single device will these devices belong to separate device
// op queues.
int physical_dev_id_to_doq_idx[ACL_MAX_DEVICE]; // [0..num_devices-1]

// Array of device_op_queues. A new queue will be allocated as required
// at platform init time.
int num_device_op_queues;
acl_device_op_queue_t *device_op_queues[ACL_MAX_DEVICE]; // [0..num_devices-1]

// TODO: REMOVE ME
// acl_device_op_queue_t device_op_queue;

// Limits. See clGetDeviceInfo for semantics.
unsigned int max_param_size;
Expand Down
2 changes: 1 addition & 1 deletion src/acl_command_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -625,7 +625,7 @@ int acl_update_queue(cl_command_queue command_queue) {
}

// First nudge the device operation scheduler.
acl_update_device_op_queue(&(acl_platform.device_op_queue));
acl_update_device_op_queue(get_device_op_queue_from_context(command_queue->context));

if (command_queue->properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) {
return acl_update_ooo_queue(command_queue);
Expand Down
31 changes: 31 additions & 0 deletions src/acl_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,19 @@ CL_API_ENTRY cl_int CL_API_CALL clReleaseContextIntelFPGA(cl_context context) {
context->device[i]->last_bin->unload_content();
}

#if 0
// TODO: Remove devices from device op queue that manages them.
// If a device op queue does not manage any devices, de-allocate it.
for (unsigned i = 0; i < context->num_devices; i++) {
unsigned int physical_device_id = context->device[i]->def.physical_device_id;
int cur_doq_idx = acl_platform.physical_dev_id_to_doq_idx[physical_device_id];
acl_platform.physical_dev_id_to_doq_idx[physical_device_id] = -1;

acl_device_op_queue_t *cur_doq = acl_platform.device_op_queues[cur_doq_idx];
cur_doq->num_managed_devices--;
}
#endif

// We have to close all devices associated with this context so they can be
// opened by other processes
acl_get_hal()->close_devices(context->num_devices, context->device);
Expand Down Expand Up @@ -343,6 +356,22 @@ CL_API_ENTRY cl_int CL_API_CALL clReleaseContextIntelFPGA(cl_context context) {
acl_free(context->command_queue);
}


#if 0
// disconnect devices managed by this context from the device op queue that
// manages them.
for (int i = 0; i < acl_platform.num_device_op_queues; i++) {
if (acl_platform.device_op_queues[i] != nullptr &&
acl_platform.device_op_queues[i]->num_managed_devices == 0) {
// Should all the ops on this queue be done by now? I hope so, we're about to
// delete the context!
acl_print_debug_msg("Deleting device op queue %d as no devices are managed by it\n", i);
//acl_free (acl_platform.device_op_queues[i]);
//acl_platform.device_op_queues[i] = nullptr;
}
}
#endif

clReleaseMemObject(context->unwrapped_host_mem);

l_forcibly_release_allocations(context);
Expand Down Expand Up @@ -1043,6 +1072,8 @@ static void l_forcibly_release_allocations(cl_context context) {
acl_release(context->device[idevice]);
}

// acl_

// Buffers might have been allocated.
acl_forcibly_release_all_memory_for_context(context);
acl_forcibly_release_all_svm_memory_for_context(context);
Expand Down
Loading