Skip to content

[SYCL] Initial commit of interop example #2

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
May 11, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion example-01/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ endif(NOT SYCL_ROOT)
set(SYCL_INCLUDE_DIR "${SYCL_ROOT}/lib/clang/11.0.0/include/")
set(SYCL_LIB "${SYCL_ROOT}/lib/libsycl.so")
set(SYCL_FLAGS "-fsycl"
"-fsycl-targets=nvptx64-nvidia-cuda-sycldevice,spir64-unknown-linux-sycldevice"
"-fsycl-targets=nvptx64-nvidia-cuda-sycldevice"
"-fsycl-unnamed-lambda")

# Build the CUDA code
Expand Down
32 changes: 32 additions & 0 deletions example-02/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# 3.17 is the first CMake release that ships FindCUDAToolkit, used below.
cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is CMake 3.17 such a strong requirement?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The

find_package(CUDAToolkit)

used below simplifies setting up the project, but is only available in CMake 3.17 and later.

project(sycl_cuda_interop LANGUAGES CXX CUDA)

# Provides the CUDA::toolkit and CUDA::cublas imported targets used below
# (available in CMake 3.17+).
find_package(CUDAToolkit)

# SYCL installation — SYCL_ROOT must point at a DPC++ install tree.
if (NOT SYCL_ROOT)
  message(FATAL_ERROR "No SYCL installation detected")
endif(NOT SYCL_ROOT)

# NOTE(review): the clang resource directory is hard-coded to 11.0.0 and will
# need updating for any other compiler version — TODO confirm/parameterize.
set(SYCL_INCLUDE_DIR "${SYCL_ROOT}/lib/clang/11.0.0/include/")
set(SYCL_LIB "${SYCL_ROOT}/lib/libsycl.so")
# Compile for the CUDA (nvptx64) SYCL device target.
set(SYCL_FLAGS "-fsycl"
    "-fsycl-targets=nvptx64-nvidia-cuda-sycldevice"
    "-fsycl-unnamed-lambda")


# Build the CUDA code
add_executable(sgemm_cuda sgemm.cu)
target_compile_features(sgemm_cuda PUBLIC cxx_std_11)
set_target_properties(sgemm_cuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
# Make the executable find the CUDA runtime libraries without LD_LIBRARY_PATH.
set_property(TARGET sgemm_cuda PROPERTY BUILD_RPATH "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}")
target_link_libraries(sgemm_cuda CUDA::toolkit CUDA::cublas)

# Build the SYCL code
add_executable (sycl_sgemm sycl_sgemm.cpp)
target_compile_features(sycl_sgemm PUBLIC cxx_std_17)
target_compile_options(sycl_sgemm PUBLIC ${SYCL_FLAGS})
target_compile_definitions(sycl_sgemm PUBLIC CUDA_NO_HALF)
# The SYCL flags are also passed at link time so device code is linked in.
target_link_libraries(sycl_sgemm PUBLIC ${SYCL_FLAGS})
target_include_directories(sycl_sgemm PUBLIC ${SYCL_INCLUDE_DIR} ${CUDA_INCLUDE_DIRS})
target_link_libraries(sycl_sgemm PUBLIC CUDA::toolkit CUDA::cublas)
46 changes: 46 additions & 0 deletions example-02/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
SYCL interop with CUDA library
-------------------------------

The example shows how to interop with CUBLAS from a SYCL for CUDA application.
The example uses Codeplay's extension *interop_task* to call the **SGEMM**
routine in CUBLAS. Parameters are extracted using the interop handler conversion.

Requirements
==============

Requires CMake 3.17 to configure (makes use of FindCUDAToolkit for simplicity)
The example is meant to be built and executed with the DPC++ compiler.


Building the example
=====================


Create a build directory and run the following command:

```
CXX=/path/to/dpc++/bin/clang++ cmake build/
```

If NVIDIA CUDA is installed in your system, CMake should be able to generate
the configuration files.

Then run

```
make
```

to build the example

Example
=========

Two source codes are provided. `sgemm.cu` is the original CUDA code calling
CUBLAS library to perform the matrix multiplication.
`sycl_sgemm.cpp` is the SYCL variant that calls CUBLAS underneath.

Both implementations perform the multiplication of square matrices A and B,
where A is an identity matrix, and B is a matrix full of ones.
The expected output in C is a matrix full of ones.

80 changes: 80 additions & 0 deletions example-02/sgemm.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#include <algorithm>
#include <iostream>
#include <vector>

#include <cublas_v2.h>
#include <cuda.h>

#define CHECK_ERROR(FUNC) checkCudaErrorMsg(FUNC, " " #FUNC)

// Prints `msg` plus the failing status code and aborts the process when a
// cuBLAS call did not succeed; does nothing on success.
inline void checkCudaErrorMsg(cublasStatus_t status, const char *msg) {
  if (status == CUBLAS_STATUS_SUCCESS) {
    return;
  }
  std::cout << msg << " - " << status << std::endl;
  exit(EXIT_FAILURE);
}

// Prints `msg` plus the failing status code and aborts the process when a
// CUDA runtime call did not succeed; does nothing on success.
//
// Fix: compare against cudaSuccess (runtime API). The previous CUDA_SUCCESS
// constant belongs to the driver API's CUresult enum; both happen to be 0,
// but mixing the two enums is incorrect and misleading.
inline void checkCudaErrorMsg(cudaError status, const char *msg) {
  if (status != cudaSuccess) {
    std::cout << msg << " - " << status << std::endl;
    exit(EXIT_FAILURE);
  }
}

// Multiplies A (identity) by B (all ones) with cuBLAS SGEMM and checks that
// the result C is a matrix full of ones.
// Returns EXIT_SUCCESS when every element of C equals 1, EXIT_FAILURE otherwise.
int main() {
  constexpr size_t WIDTH = 1024;
  constexpr size_t HEIGHT = 1024;
  constexpr float ALPHA = 1.0f; // SGEMM computes C = ALPHA * A * B + BETA * C
  constexpr float BETA = 0.0f;

  std::vector<float> h_A(WIDTH * HEIGHT), h_B(WIDTH * HEIGHT),
      h_C(WIDTH * HEIGHT);

  std::cout << "Size: " << h_C.size() << std::endl;
  float *d_A, *d_B, *d_C; // device allocations

  // A is an identity matrix
  std::fill(std::begin(h_A), std::end(h_A), 0.0f);
  for (size_t i = 0; i < WIDTH; i++) {
    h_A[i * WIDTH + i] = 1.0f;
  }

  // B is filled with ones
  std::fill(std::begin(h_B), std::end(h_B), 1.0f);

  const size_t numBytes = WIDTH * HEIGHT * sizeof(float);

  // Allocate device memory and copy the two input matrices over
  CHECK_ERROR(cudaMalloc((void **)&d_A, numBytes));
  CHECK_ERROR(cudaMalloc((void **)&d_B, numBytes));
  CHECK_ERROR(cudaMalloc((void **)&d_C, numBytes));

  CHECK_ERROR(cudaMemcpy(d_A, h_A.data(), numBytes, cudaMemcpyHostToDevice));
  CHECK_ERROR(cudaMemcpy(d_B, h_B.data(), numBytes, cudaMemcpyHostToDevice));

  cublasHandle_t handle;
  CHECK_ERROR(cublasCreate(&handle));

  // C = A * B (since A is the identity, C should come out equal to B)
  CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
                          WIDTH, &ALPHA, d_A, WIDTH, d_B, WIDTH, &BETA, d_C,
                          WIDTH));

  // cudaMemcpy on the default stream synchronizes with the SGEMM above
  CHECK_ERROR(cudaMemcpy(h_C.data(), d_C, numBytes, cudaMemcpyDeviceToHost));

  // C must be all ones
  const bool allEqual = std::all_of(std::begin(h_C), std::end(h_C),
                                    [](float num) { return num == 1; });

  if (!allEqual) {
    std::cout << " Incorrect result " << std::endl;
  } else {
    std::cout << " Correct! " << std::endl;
  }

  // Release the cuBLAS handle and device memory before exiting
  CHECK_ERROR(cublasDestroy(handle));
  CHECK_ERROR(cudaFree(d_A));
  CHECK_ERROR(cudaFree(d_B));
  CHECK_ERROR(cudaFree(d_C));

  return allEqual ? EXIT_SUCCESS : EXIT_FAILURE;
}
114 changes: 114 additions & 0 deletions example-02/sycl_sgemm.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
#include <algorithm>
#include <iostream>
#include <vector>

#include <CL/sycl.hpp>
#include <CL/sycl/backend/cuda.hpp>

#include <cublas_v2.h>
#include <cuda.h>

#define CHECK_ERROR(FUNC) checkCudaErrorMsg(FUNC, " " #FUNC)

// Aborts with a diagnostic (`msg` plus the raw status code) when a cuBLAS
// call fails; a successful status is a no-op.
inline void checkCudaErrorMsg(cublasStatus_t status, const char *msg) {
  if (status == CUBLAS_STATUS_SUCCESS) {
    return;
  }
  std::cout << msg << " - " << status << std::endl;
  exit(EXIT_FAILURE);
}

// Aborts with a diagnostic (`msg` plus the raw status code) when a CUDA
// runtime call fails; a successful status is a no-op.
//
// Fix: compare against cudaSuccess (runtime API). The previous CUDA_SUCCESS
// constant belongs to the driver API's CUresult enum; both happen to be 0,
// but mixing the two enums is incorrect and misleading.
inline void checkCudaErrorMsg(cudaError status, const char *msg) {
  if (status != cudaSuccess) {
    std::cout << msg << " - " << status << std::endl;
    exit(EXIT_FAILURE);
  }
}

// Device selector that picks a CUDA-backed GPU.
//
// A device scores 1 when it is a GPU whose driver version string contains
// "CUDA"; every other device scores -1, which tells the SYCL runtime it must
// never be selected.
//
// Fix: removed a stray semicolon after the if block (empty statement).
class CUDASelector : public sycl::device_selector {
public:
  int operator()(const sycl::device &Device) const override {
    using namespace sycl::info;

    const std::string DriverVersion = Device.get_info<device::driver_version>();

    if (Device.is_gpu() && (DriverVersion.find("CUDA") != std::string::npos)) {
      std::cout << " CUDA device found " << std::endl;
      return 1;
    }
    return -1;
  }
};

// SYCL version of the SGEMM example: multiplies A (identity) by B (all ones)
// by calling cuBLAS from inside a SYCL interop task, then checks that the
// result C is a matrix full of ones.
// Returns EXIT_SUCCESS when every element of C equals 1, EXIT_FAILURE otherwise.
int main() {
  using namespace sycl;

  constexpr size_t WIDTH = 1024;
  constexpr size_t HEIGHT = 1024;
  constexpr float ALPHA = 1.0f; // SGEMM computes C = ALPHA * A * B + BETA * C
  constexpr float BETA = 0.0f;

  std::vector<float> h_A(WIDTH * HEIGHT), h_B(WIDTH * HEIGHT),
      h_C(WIDTH * HEIGHT);

  std::cout << "Size: " << h_C.size() << std::endl;
  // NOTE(review): these raw pointers are never used in this function (the
  // lambda below declares its own d_A/d_B/d_C accessors) — leftover from the
  // CUDA version; candidates for removal.
  float *d_A, *d_B, *d_C;

  // A is an identity matrix
  std::fill(std::begin(h_A), std::end(h_A), 0.0f);
  for (size_t i = 0; i < WIDTH; i++) {
    h_A[i * WIDTH + i] = 1.0f;
  }

  // B is filled with ones
  std::fill(std::begin(h_B), std::end(h_B), 1.0f);

  // Queue bound to a CUDA device so the interop task below can use cuBLAS
  sycl::queue q{CUDASelector()};

  cublasHandle_t handle;
  CHECK_ERROR(cublasCreate(&handle));

  // Buffers live in this scope only: their destruction at the closing brace
  // makes the results visible in h_C before the verification below runs.
  {
    buffer<float, 2> b_A{h_A.data(), range<2>{WIDTH, HEIGHT}};
    buffer<float, 2> b_B{h_B.data(), range<2>{WIDTH, HEIGHT}};
    buffer<float, 2> b_C{h_C.data(), range<2>{WIDTH, HEIGHT}};

    q.submit([&](handler &h) {
      // Accessors declare the dependencies the runtime must satisfy before
      // the interop task runs
      auto d_A = b_A.get_access<sycl::access::mode::read>(h);
      auto d_B = b_B.get_access<sycl::access::mode::read>(h);
      auto d_C = b_C.get_access<sycl::access::mode::write>(h);

      // Codeplay interop extension: the lambda receives a handler that
      // exposes the native CUDA objects backing the queue and accessors
      h.interop_task([=](sycl::interop_handler ih) {
        // Route cuBLAS work onto the queue's native CUDA stream.
        // NOTE(review): the cublasSetStream return status is not checked —
        // consider wrapping it in CHECK_ERROR.
        cublasSetStream(handle, ih.get_queue<backend::cuda>());

        // get_mem yields the native device allocation behind each accessor
        auto cuA = reinterpret_cast<float *>(ih.get_mem<backend::cuda>(d_A));
        auto cuB = reinterpret_cast<float *>(ih.get_mem<backend::cuda>(d_B));
        auto cuC = reinterpret_cast<float *>(ih.get_mem<backend::cuda>(d_C));

        // C = A * B (since A is the identity, C should come out equal to B)
        CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
                                WIDTH, &ALPHA, cuA, WIDTH, cuB, WIDTH, &BETA,
                                cuC, WIDTH));
      });
    });
  }

  // C must be all ones; report the index of the first mismatch for debugging
  int i = 0;
  const bool allEqual =
      std::all_of(std::begin(h_C), std::end(h_C), [&i](float num) {
        ++i;
        if (num != 1) {
          std::cout << i << " Not one : " << num << std::endl;
        }
        return num == 1;
      });

  if (!allEqual) {
    std::cout << " Incorrect result " << std::endl;
  } else {
    std::cout << " Correct! " << std::endl;
  }

  CHECK_ERROR(cublasDestroy(handle));

  return allEqual ? EXIT_SUCCESS : EXIT_FAILURE;
}