-
Notifications
You must be signed in to change notification settings - Fork 70
Change implementation of acl_hal_mmd_get_timestamp() to use std::chro… #41
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
…no instead of clock_gettime.
Testing difference in kernel runtime using following design #include <iostream>
#include <chrono>
#include "ocl.h"
using namespace aocl_utils;
#ifndef ITERATION_TIME
#define ITERATION_TIME 5
#endif
// ACL runtime configuration
static cl_platform_id platform;
static cl_device_id device;
static cl_context context;
static cl_command_queue queue;
static cl_int status;
static cl_program program;
static cl_kernel kernel_dummy;
static cl_mem kernel_buf;
static Timer timer;
static cl_event evt;
struct BenchmarkTime get_exec_time_us(cl_event evt);
// free the resources allocated during initialization
static void freeResources() {
if (queue) clReleaseCommandQueue(queue);
if (context) clReleaseContext(context);
if (kernel_dummy) clReleaseKernel(kernel_dummy);
if (program) clReleaseProgram(program);
if(kernel_buf) clReleaseMemObject(kernel_buf);
}
static void dump_error(const char* str, cl_int status) {
printf("%s\n", str);
printf("Error code: %d\n", status);
freeResources();
exit(-1);
}
/******************************************************************************************/
/*************************************** MAIN LOGIC ***************************************/
/******************************************************************************************/
// Initiate the platform, device, context, queue, and other global variables
void device_init()
{
char buf[1000];
cl_uint num_devices;
size_t bin_file_len = 0;
const unsigned char* bin_file;
cl_int bin_status = 0;
printf("Initializing OpenCL\n");
#ifdef EMULATOR_FAST
platform = findPlatform("Intel(R) FPGA Emulation Platform for OpenCL(TM)");
#else
platform = findPlatform("Intel(R) FPGA SDK for OpenCL(TM)");
#endif
if(platform == NULL) {
printf("ERROR: Unable to find Intel(R) FPGA OpenCL platform.\n");
exit(-1);
}
// get the device ID
status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, &num_devices);
if(status != CL_SUCCESS) dump_error("Failed clGetDeviceIDs.", status);
if(num_devices < 1) {
printf("Found %d devices!\n", num_devices);
freeResources();
exit(-1);
}
status = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(buf), (void*)&buf, NULL);
printf("Using Device with name: %s\n",buf);
status = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(buf), (void*)&buf, NULL);
printf("Using Device from vendor: %s\n",buf);
// create a context
context = clCreateContext(0, 1, &device, NULL, NULL, &status);
if(status != CL_SUCCESS) dump_error("Failed clCreateContext.", status);
// create a command queue
queue = clCreateCommandQueue(context, device, NULL, &status);
if(status != CL_SUCCESS) dump_error("Failed clCreateCommandQueue.", status);
// load the aocx
printf("Loading dummy.aocx...\n");
bin_file = load_file("dummy.aocx", &bin_file_len);
// create the program
program = clCreateProgramWithBinary(context, 1, &device, &bin_file_len, &bin_file, &bin_status, &status);
if(status != CL_SUCCESS) dump_error("Failed clCreateProgramWithBinary.", status);
// build the program
printf("build dummy_k program\n"); fflush(stdout);
status = clBuildProgram(program, 1, &device, "", 0, 0);
if(status != CL_SUCCESS) dump_error("Failed clBuildProgram.", status);
// create the kernel
printf("create kernel dummy_k\n"); fflush(stdout);
kernel_dummy = clCreateKernel(program, "dummy_k", &status);
if(status != CL_SUCCESS) dump_error("Failed clCreateKernel.", status);
// create the input buffer
kernel_buf = clCreateBuffer(context, CL_MEM_READ_WRITE, 1000, NULL, &status);
if(status != CL_SUCCESS) dump_error("Failed clCreateBuffer.", status);
printf("\n\n");
}
// Repeatedly run get_enqueueTask_time
void run_test()
{
// Empty queue just in case
clFinish(queue);
for (int i = 0; i < 100; i++) {
#ifdef UNRELEASED_AT_QUEUE_START
clEnqueueTask(queue, kernel_dummy, 0, NULL, NULL);
#else
clEnqueueTask(queue, kernel_dummy, 0, NULL, &evt);
#endif
clFinish(queue);
}
const auto tstart = std::chrono::high_resolution_clock::now();
for (size_t i = 0; i < 100000; i++) {
#ifdef UNRELEASED_AT_QUEUE_START
clEnqueueTask(queue, kernel_dummy, 0, NULL, NULL);
#else
clEnqueueTask(queue, kernel_dummy, 0, NULL, &evt);
#endif
clFinish(queue);
}
const auto tend = std::chrono::high_resolution_clock::now();
// compute the average kernel latency across the measured iterations
const std::chrono::duration<double, std::milli> time_span = tend - tstart;
std::cout << "Overall Time: " << time_span.count() << "\n";
}
/******************************************************************************************/
/*********************************** MAIN LOGIC HELPER ************************************/
/******************************************************************************************/
// I can set the status and do error check inside the timer, but that extra time might skew the result.
// Since I don't know how fast these operations will be, I'll not do that. All operations are assumed to execute correctly
struct BenchmarkTime get_enqueueTask_time(bool verbose)
{
timer.start();
// Special case: Will not pass event in. As a result, we can't measure queue-submit-start time
#ifdef UNRELEASED_AT_QUEUE_START
clEnqueueTask(queue, kernel_dummy, 0, NULL, NULL);
#else
clEnqueueTask(queue, kernel_dummy, 0, NULL, &evt);
#endif
clFinish(queue);
timer.stop();
struct BenchmarkTime enqueueTask_time;
#ifndef UNRELEASED_AT_QUEUE_START
enqueueTask_time = get_exec_time_us(evt);
#endif
enqueueTask_time.execution_time = timer.get_time_s() * 1000000;
if (verbose)
print_benchmark_time(enqueueTask_time);
return enqueueTask_time;
}
unsigned char* load_file(const char* filename, size_t* size_ret)
{
FILE* fp = fopen(filename,"rb");
long size;
unsigned char *buffer;
if (!fp) {
return NULL;
}
fseek(fp, 0, SEEK_END);
size = ftell(fp);
fseek(fp, 0, SEEK_SET);
buffer = (unsigned char *) malloc(size);
fread(buffer, size, 1, fp);
fclose(fp);
*size_ret = size;
return buffer;
}
/******************************************************************************************/
/******************************** BENCHMARK_TIME UTILITY **********************************/
/******************************************************************************************/
struct BenchmarkTime get_exec_time_us(cl_event evt)
{
struct BenchmarkTime res;
cl_ulong kernelEventQueued;
cl_ulong kernelEventSubmit;
cl_ulong kernelEventStart;
cl_ulong kernelEventEnd;
clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_QUEUED, sizeof(unsigned long long), &kernelEventQueued, NULL);
clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(unsigned long long), &kernelEventSubmit, NULL);
clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(unsigned long long), &kernelEventStart, NULL);
clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, sizeof(unsigned long long), &kernelEventEnd, NULL);
res.queued_submit_time = (float)(kernelEventSubmit - kernelEventQueued) / 1000;
res.submit_start_time = (float)(kernelEventStart - kernelEventSubmit) / 1000;
res.start_end_time = (float)(kernelEventEnd - kernelEventStart) / 1000;
return res;
}
void print_benchmark_time(struct BenchmarkTime benchmark_time) {
float cumul_time = 0.0f;
cumul_time += benchmark_time.queued_submit_time;
printf(" Queued-Submit time: %9.3lf us, cumulative time: %9.3lf us\n", benchmark_time.queued_submit_time, cumul_time);
cumul_time += benchmark_time.submit_start_time;
printf(" Submit-Start time: %9.3lf us, cumulative time: %9.3lf us\n", benchmark_time.submit_start_time, cumul_time);
cumul_time += benchmark_time.start_end_time;
printf(" Start-End time: %9.3lf us, cumulative time: %9.3lf us\n", benchmark_time.start_end_time, cumul_time);
printf(" Execution time: %9.3lf us\n", benchmark_time.execution_time);
printf("\n\n");
}
/******************************************************************************************/
/************************************* OTHER UTILITY **************************************/
/******************************************************************************************/
// Searches all platforms for the first platform whose name
// contains the search string (case-insensitive).
cl_platform_id findPlatform(const char *platform_name_search) {
cl_int status;
std::string search = platform_name_search;
std::transform(search.begin(), search.end(), search.begin(), tolower);
// Get number of platforms.
cl_uint num_platforms;
status = clGetPlatformIDs(0, NULL, &num_platforms);
if(status != CL_SUCCESS) dump_error("Query for number of platforms failed", status);
// Get a list of all platform ids.
scoped_array<cl_platform_id> pids(num_platforms);
status = clGetPlatformIDs(num_platforms, pids, NULL);
if(status != CL_SUCCESS) dump_error("Query for all platform ids failed", status);
// For each platform, get name and compare against the search string.
for(unsigned i = 0; i < num_platforms; ++i) {
std::string name = getPlatformName(pids[i]);
// Convert to lower case.
std::transform(name.begin(), name.end(), name.begin(), tolower);
if(name.find(search) != std::string::npos) {
// Found!
return pids[i];
}
}
// No platform found.
return NULL;
}
// Returns the platform name.
std::string getPlatformName(cl_platform_id pid) {
cl_int status;
size_t sz;
status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, 0, NULL, &sz);
if(status != CL_SUCCESS) dump_error("Query for platform name size failed", status);
scoped_array<char> name(sz);
status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, sz, name, NULL);
if(status != CL_SUCCESS) dump_error("Query for platform name failed", status);
return name.get();
} |
Current benchmarks: old: 3278.61 ms old (no profiling): 3359.73 ms Looks like there is random variation. Going to do multiple runs and average out. Variation on regtest runs is in the 100s of ms. The improvement cannot be measured with this method |
…no instead of clock_gettime.