Skip to content

Change implementation of acl_hal_mmd_get_timestamp() to use std::chro… #41

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from

Conversation

huangbri
Copy link
Contributor

…no instead of clock_gettime.

@huangbri huangbri self-assigned this Dec 10, 2021
@huangbri huangbri added this to the 2022.2 milestone Dec 10, 2021
@pcolberg pcolberg modified the milestones: 2022.2, 2022.3 Jan 20, 2022
@huangbri
Copy link
Contributor Author

huangbri commented Mar 18, 2022

Testing difference in kernel runtime using following design

#include <iostream>
#include <chrono>
#include "ocl.h"

using namespace aocl_utils;

#ifndef ITERATION_TIME
#define ITERATION_TIME 5
#endif

// ACL runtime configuration
static cl_platform_id platform;
static cl_device_id device;
static cl_context context;
static cl_command_queue queue;
static cl_int status;
static cl_program program;
static cl_kernel kernel_dummy;
static cl_mem kernel_buf;

static Timer timer;
static cl_event evt;

struct BenchmarkTime get_exec_time_us(cl_event evt);

// free the resources allocated during initialization
static void freeResources() {
    if (queue)          clReleaseCommandQueue(queue);
    if (context)        clReleaseContext(context);
    if (kernel_dummy)   clReleaseKernel(kernel_dummy);
    if (program)        clReleaseProgram(program);
    if(kernel_buf)      clReleaseMemObject(kernel_buf);
}


static void dump_error(const char* str, cl_int status) {
    printf("%s\n", str);
    printf("Error code: %d\n", status);
    freeResources();
    exit(-1);
}


/******************************************************************************************/
/*************************************** MAIN LOGIC ***************************************/
/******************************************************************************************/

// Initiate the platform, device, context, queue, and other global variables
void device_init()
{
    char buf[1000];
    cl_uint num_devices;
    size_t bin_file_len = 0;
    const unsigned char* bin_file;
    cl_int bin_status = 0;

    printf("Initializing OpenCL\n");

#ifdef EMULATOR_FAST
    platform = findPlatform("Intel(R) FPGA Emulation Platform for OpenCL(TM)");
#else
    platform = findPlatform("Intel(R) FPGA SDK for OpenCL(TM)");
#endif
    if(platform == NULL) {
        printf("ERROR: Unable to find Intel(R) FPGA OpenCL platform.\n");
        exit(-1);
    }

    // get the device ID
    status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, &num_devices);
    if(status != CL_SUCCESS) dump_error("Failed clGetDeviceIDs.", status);
    if(num_devices < 1) {
        printf("Found %d devices!\n", num_devices);
        freeResources();
        exit(-1);
    }
    status = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(buf), (void*)&buf, NULL);
    printf("Using Device with name: %s\n",buf);
    status = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(buf), (void*)&buf, NULL);
    printf("Using Device from vendor: %s\n",buf);

    // create a context
    context = clCreateContext(0, 1, &device, NULL, NULL, &status);
    if(status != CL_SUCCESS) dump_error("Failed clCreateContext.", status);

    // create a command queue
    queue = clCreateCommandQueue(context, device, NULL, &status);
    if(status != CL_SUCCESS) dump_error("Failed clCreateCommandQueue.", status);

    // load the aocx
    printf("Loading dummy.aocx...\n");
    bin_file = load_file("dummy.aocx", &bin_file_len);

    // create the program
    program = clCreateProgramWithBinary(context, 1, &device, &bin_file_len, &bin_file, &bin_status, &status);
    if(status != CL_SUCCESS) dump_error("Failed clCreateProgramWithBinary.", status);

    // build the program
    printf("build dummy_k program\n"); fflush(stdout);
    status = clBuildProgram(program, 1, &device, "", 0, 0);
    if(status != CL_SUCCESS) dump_error("Failed clBuildProgram.", status);

    // create the kernel
    printf("create kernel dummy_k\n"); fflush(stdout);
    kernel_dummy = clCreateKernel(program, "dummy_k", &status);
    if(status != CL_SUCCESS) dump_error("Failed clCreateKernel.", status);

    // create the input buffer
    kernel_buf = clCreateBuffer(context, CL_MEM_READ_WRITE, 1000, NULL, &status);
    if(status != CL_SUCCESS) dump_error("Failed clCreateBuffer.", status);
    printf("\n\n");
}


// Repeatedly run get_enqueueTask_time
void run_test()
{
    // Empty queue just in case
    clFinish(queue);
    for (int i = 0; i < 100; i++) {
    #ifdef UNRELEASED_AT_QUEUE_START
        clEnqueueTask(queue, kernel_dummy, 0, NULL, NULL);
    #else
        clEnqueueTask(queue, kernel_dummy, 0, NULL, &evt);
    #endif
        clFinish(queue);
    }
    

    const auto tstart = std::chrono::high_resolution_clock::now();
    for (size_t i = 0; i < 100000; i++) {
    #ifdef UNRELEASED_AT_QUEUE_START
      clEnqueueTask(queue, kernel_dummy, 0, NULL, NULL);
    #else
      clEnqueueTask(queue, kernel_dummy, 0, NULL, &evt);
    #endif
      clFinish(queue);
    }
    const auto tend = std::chrono::high_resolution_clock::now();

    // compute the average kernel latency across the measured iterations
    const std::chrono::duration<double, std::milli> time_span = tend - tstart;


    std::cout << "Overall Time: " << time_span.count() << "\n";
}





/******************************************************************************************/
/*********************************** MAIN LOGIC HELPER ************************************/
/******************************************************************************************/

// I can set the status and do error check inside the timer, but that extra time might skew the result.
// Since I don't know how fast these operations will be, I'll not do that. All operations are assumed to execute correctly
struct BenchmarkTime get_enqueueTask_time(bool verbose)
{
    timer.start();
// Special case: Will not pass event in. As a result, we can't measure queue-submit-start time
#ifdef UNRELEASED_AT_QUEUE_START
    clEnqueueTask(queue, kernel_dummy, 0, NULL, NULL);
#else
    clEnqueueTask(queue, kernel_dummy, 0, NULL, &evt);
#endif
    clFinish(queue);
    timer.stop();

    struct BenchmarkTime enqueueTask_time;
#ifndef UNRELEASED_AT_QUEUE_START
    enqueueTask_time = get_exec_time_us(evt);
#endif
    enqueueTask_time.execution_time = timer.get_time_s() * 1000000;

    if (verbose)
        print_benchmark_time(enqueueTask_time);
    return enqueueTask_time;
}


unsigned char* load_file(const char* filename, size_t* size_ret)
{
    FILE* fp = fopen(filename,"rb");
    long size;
    unsigned char *buffer;

    if (!fp) {
      return NULL;
    }
    fseek(fp, 0, SEEK_END);
    size = ftell(fp);
    fseek(fp, 0, SEEK_SET);
    buffer = (unsigned char *) malloc(size);
    fread(buffer, size, 1, fp);
    fclose(fp);
    *size_ret = size;
    return buffer;
}

/******************************************************************************************/
/******************************** BENCHMARK_TIME UTILITY **********************************/
/******************************************************************************************/
struct BenchmarkTime get_exec_time_us(cl_event evt)
{
    struct BenchmarkTime res;
    cl_ulong kernelEventQueued;
    cl_ulong kernelEventSubmit;
    cl_ulong kernelEventStart;
    cl_ulong kernelEventEnd;
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_QUEUED, sizeof(unsigned long long), &kernelEventQueued, NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(unsigned long long), &kernelEventSubmit, NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(unsigned long long), &kernelEventStart, NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, sizeof(unsigned long long), &kernelEventEnd, NULL);

    res.queued_submit_time = (float)(kernelEventSubmit - kernelEventQueued) / 1000;
    res.submit_start_time = (float)(kernelEventStart - kernelEventSubmit) / 1000;
    res.start_end_time = (float)(kernelEventEnd - kernelEventStart) / 1000;

    return res;
}


void print_benchmark_time(struct BenchmarkTime benchmark_time) {
    float cumul_time = 0.0f;
    cumul_time += benchmark_time.queued_submit_time;
    printf("        Queued-Submit time: %9.3lf us,  cumulative time: %9.3lf us\n", benchmark_time.queued_submit_time, cumul_time);
    cumul_time += benchmark_time.submit_start_time;
    printf("        Submit-Start time:  %9.3lf us,  cumulative time: %9.3lf us\n", benchmark_time.submit_start_time, cumul_time);
    cumul_time += benchmark_time.start_end_time;
    printf("        Start-End time:     %9.3lf us,  cumulative time: %9.3lf us\n", benchmark_time.start_end_time, cumul_time);

    printf("        Execution time:     %9.3lf us\n", benchmark_time.execution_time);
    printf("\n\n");
}


/******************************************************************************************/
/************************************* OTHER UTILITY **************************************/
/******************************************************************************************/

// Searches all platforms for the first platform whose name
// contains the search string (case-insensitive).
cl_platform_id findPlatform(const char *platform_name_search) {
    cl_int status;

    std::string search = platform_name_search;
    std::transform(search.begin(), search.end(), search.begin(), tolower);

    // Get number of platforms.
    cl_uint num_platforms;
    status = clGetPlatformIDs(0, NULL, &num_platforms);
    if(status != CL_SUCCESS) dump_error("Query for number of platforms failed", status);

    // Get a list of all platform ids.
    scoped_array<cl_platform_id> pids(num_platforms);
    status = clGetPlatformIDs(num_platforms, pids, NULL);
    if(status != CL_SUCCESS) dump_error("Query for all platform ids failed", status);

    // For each platform, get name and compare against the search string.
    for(unsigned i = 0; i < num_platforms; ++i) {
        std::string name = getPlatformName(pids[i]);

        // Convert to lower case.
        std::transform(name.begin(), name.end(), name.begin(), tolower);

        if(name.find(search) != std::string::npos) {
        // Found!
        return pids[i];
        }
    }

    // No platform found.
    return NULL;
}


// Returns the platform name.
std::string getPlatformName(cl_platform_id pid) {
    cl_int status;

    size_t sz;
    status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, 0, NULL, &sz);
    if(status != CL_SUCCESS) dump_error("Query for platform name size failed", status);

    scoped_array<char> name(sz);
    status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, sz, name, NULL);
    if(status != CL_SUCCESS) dump_error("Query for platform name failed", status);

    return name.get();
}

@huangbri
Copy link
Contributor Author

huangbri commented Mar 18, 2022

Current benchmarks:

old: 3278.61 ms
new: 3457.42 ms

old (no profiling): 3359.73 ms
new (no profiling): 3171.21 ms

Looks like there is random variation. Going to do multiple runs and average out.

Variation on regtest runs is in the 100s of ms.
Change makes a 10^-4 ms improvement per enqueue, 10^1 ms improvement over 10^5 ms loops.

The improvement cannot be measured with this method

@pcolberg pcolberg assigned pcolberg and unassigned huangbri Apr 30, 2022
@pcolberg pcolberg modified the milestones: 2022.3, 2022.4 Jun 4, 2022
@pcolberg pcolberg removed this from the 2023.0 milestone Oct 19, 2022
@pcolberg pcolberg added the enhancement New feature or request label Dec 8, 2022
@pcolberg pcolberg removed their assignment Mar 15, 2025
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
enhancement New feature or request
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants