-
Notifications
You must be signed in to change notification settings - Fork 25
[SYCL] Initial commit of interop example #2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
project(sycl_cuda_interop LANGUAGES CXX CUDA)

# FindCUDAToolkit (new in CMake 3.17) provides the CUDA::toolkit and
# CUDA::cublas imported targets used below. REQUIRED turns a missing toolkit
# into a clear configure-time error instead of an unresolved-target failure
# later at generate time.
find_package(CUDAToolkit REQUIRED)

# SYCL installation
if (NOT SYCL_ROOT)
  message(FATAL_ERROR "No SYCL installation detected")
endif()

# NOTE(review): the clang resource directory is hard-coded for clang 11.0.0;
# this path must be updated whenever the DPC++ compiler version changes.
set(SYCL_INCLUDE_DIR "${SYCL_ROOT}/lib/clang/11.0.0/include/")
set(SYCL_LIB "${SYCL_ROOT}/lib/libsycl.so")
set(SYCL_FLAGS "-fsycl"
    "-fsycl-targets=nvptx64-nvidia-cuda-sycldevice"
    "-fsycl-unnamed-lambda")

# Build the CUDA code
add_executable(sgemm_cuda sgemm.cu)
target_compile_features(sgemm_cuda PUBLIC cxx_std_11)
set_target_properties(sgemm_cuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
set_property(TARGET sgemm_cuda PROPERTY BUILD_RPATH "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}")
target_link_libraries(sgemm_cuda PUBLIC CUDA::toolkit CUDA::cublas)

# Build the SYCL code
add_executable(sycl_sgemm sycl_sgemm.cpp)
target_compile_features(sycl_sgemm PUBLIC cxx_std_17)
target_compile_options(sycl_sgemm PUBLIC ${SYCL_FLAGS})
target_compile_definitions(sycl_sgemm PUBLIC CUDA_NO_HALF)
# The SYCL flags must also be passed at link time so device code is linked;
# target_link_options is the idiomatic way to do that (rather than smuggling
# flags through target_link_libraries).
target_link_options(sycl_sgemm PUBLIC ${SYCL_FLAGS})
target_include_directories(sycl_sgemm PUBLIC ${SYCL_INCLUDE_DIR} ${CUDA_INCLUDE_DIRS})
target_link_libraries(sycl_sgemm PUBLIC CUDA::toolkit CUDA::cublas)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
SYCL interop with CUDA library | ||
------------------------------- | ||
|
||
The example shows how to interop with CUBLAS from a SYCL for CUDA application. | ||
The example uses Codeplay's extension *interop_task* to call the **SGEMM** | ||
routine in CUBLAS. Parameters are extracted using the interop handler conversion. | ||
|
||
Requirements | ||
============== | ||
|
||
Requires CMake 3.17 to configure (makes use of FindCUDAToolkit for simplicity) | ||
The example is meant to be built and executed with the DPC++ compiler.
|
||
|
||
Building the example | ||
===================== | ||
|
||
|
||
Create a build directory and run the following command: | ||
|
||
``` | ||
CXX=/path/to/dpc++/bin/clang++ cmake build/ | ||
``` | ||
|
||
If NVIDIA CUDA is installed in your system, CMake should be able to generate | ||
the configuration files. | ||
|
||
Then run | ||
|
||
``` | ||
make | ||
``` | ||
|
||
to build the example | ||
|
||
Example | ||
========= | ||
|
||
Two source codes are provided. `sgemm.cu` is the original CUDA code calling | ||
CUBLAS library to perform the matrix multiplication. | ||
`sycl_sgemm.cpp` is the SYCL variant that calls cuBLAS underneath.
|
||
Both implementations perform the multiplication of square matrices A and B,
where A is an identity matrix and B is a matrix full of ones.
The expected output in C is a matrix full of ones.
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
#include <algorithm> | ||
#include <iostream> | ||
#include <vector> | ||
|
||
#include <cublas_v2.h> | ||
#include <cuda.h> | ||
|
||
#define CHECK_ERROR(FUNC) checkCudaErrorMsg(FUNC, " " #FUNC) | ||
|
||
// Aborts the program with a diagnostic message when a cuBLAS call fails.
// `msg` is the stringified call site produced by the CHECK_ERROR macro.
void inline checkCudaErrorMsg(cublasStatus_t status, const char *msg) {
  if (status == CUBLAS_STATUS_SUCCESS) {
    return;
  }
  std::cout << msg << " - " << status << std::endl;
  exit(EXIT_FAILURE);
}
|
||
// Aborts the program with a diagnostic message when a CUDA runtime call fails.
// `msg` is the stringified call site produced by the CHECK_ERROR macro.
//
// Fix: `cudaError` is the CUDA *runtime* API error type, so it must be
// compared against `cudaSuccess`. The original compared against CUDA_SUCCESS,
// which is the *driver* API (CUresult) enumerator and only matched by the
// accident of both constants being zero.
void inline checkCudaErrorMsg(cudaError status, const char *msg) {
  if (status != cudaSuccess) {
    std::cout << msg << " - " << status << std::endl;
    exit(EXIT_FAILURE);
  }
}
|
||
int main() { | ||
constexpr size_t WIDTH = 1024; | ||
constexpr size_t HEIGHT = 1024; | ||
constexpr float ALPHA = 1.0f; | ||
constexpr float BETA = 0.0f; | ||
|
||
std::vector<float> h_A(WIDTH * HEIGHT), h_B(WIDTH * HEIGHT), | ||
h_C(WIDTH * HEIGHT); | ||
|
||
std::cout << "Size: " << h_C.size() << std::endl; | ||
float *d_A, *d_B, *d_C; | ||
|
||
// A is an identity matrix | ||
std::fill(std::begin(h_A), std::end(h_A), 0.0f); | ||
for (size_t i = 0; i < WIDTH; i++) { | ||
h_A[i * WIDTH + i] = 1.0f; | ||
} | ||
|
||
// B is a matrix fill with 1 | ||
std::fill(std::begin(h_B), std::end(h_B), 1.0f); | ||
|
||
const size_t numBytes = WIDTH * HEIGHT * sizeof(float); | ||
|
||
CHECK_ERROR(cudaMalloc((void **)&d_A, numBytes)); | ||
CHECK_ERROR(cudaMalloc((void **)&d_B, numBytes)); | ||
CHECK_ERROR(cudaMalloc((void **)&d_C, numBytes)); | ||
|
||
CHECK_ERROR(cudaMemcpy(d_A, h_A.data(), numBytes, cudaMemcpyHostToDevice)); | ||
CHECK_ERROR(cudaMemcpy(d_B, h_B.data(), numBytes, cudaMemcpyHostToDevice)); | ||
|
||
cublasHandle_t handle; | ||
CHECK_ERROR(cublasCreate(&handle)); | ||
|
||
// C = A * B | ||
CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT, | ||
WIDTH, &ALPHA, d_A, WIDTH, d_B, WIDTH, &BETA, d_C, | ||
WIDTH)); | ||
|
||
CHECK_ERROR(cudaMemcpy(h_C.data(), d_C, numBytes, cudaMemcpyDeviceToHost)); | ||
|
||
// C must be all ones | ||
const bool allEqual = std::all_of(std::begin(h_C), std::end(h_C), | ||
[](float num) { return num == 1; }); | ||
|
||
if (!allEqual) { | ||
std::cout << " Incorrect result " << std::endl; | ||
} else { | ||
std::cout << " Correct! " << std::endl; | ||
} | ||
|
||
CHECK_ERROR(cublasDestroy(handle)); | ||
CHECK_ERROR(cudaFree(d_A)); | ||
CHECK_ERROR(cudaFree(d_B)); | ||
CHECK_ERROR(cudaFree(d_C)); | ||
|
||
return allEqual ? EXIT_SUCCESS : EXIT_FAILURE; | ||
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
#include <algorithm> | ||
#include <iostream> | ||
#include <vector> | ||
|
||
#include <CL/sycl.hpp> | ||
#include <CL/sycl/backend/cuda.hpp> | ||
|
||
#include <cublas_v2.h> | ||
#include <cuda.h> | ||
|
||
#define CHECK_ERROR(FUNC) checkCudaErrorMsg(FUNC, " " #FUNC) | ||
|
||
// Terminates the process with a diagnostic if a cuBLAS status is not success.
// `msg` carries the stringified call expression from the CHECK_ERROR macro.
void inline checkCudaErrorMsg(cublasStatus_t status, const char *msg) {
  if (status == CUBLAS_STATUS_SUCCESS) {
    return;
  }
  std::cout << msg << " - " << status << std::endl;
  exit(EXIT_FAILURE);
}
|
||
// Terminates the process with a diagnostic if a CUDA runtime call failed.
// `msg` carries the stringified call expression from the CHECK_ERROR macro.
//
// Fix: `cudaError` is the CUDA *runtime* API error type, so it must be
// compared against `cudaSuccess`. The original compared against CUDA_SUCCESS,
// which is the *driver* API (CUresult) enumerator and only matched by the
// accident of both constants being zero.
void inline checkCudaErrorMsg(cudaError status, const char *msg) {
  if (status != cudaSuccess) {
    std::cout << msg << " - " << status << std::endl;
    exit(EXIT_FAILURE);
  }
}
|
||
// Device selector that steers the SYCL queue onto the CUDA backend: a device
// scores 1 when it is a GPU whose driver-version string mentions "CUDA", and
// -1 (rejected) otherwise.
class CUDASelector : public sycl::device_selector {
public:
  int operator()(const sycl::device &Device) const override {
    using namespace sycl::info;

    const std::string DriverVersion = Device.get_info<device::driver_version>();
    const bool isCudaGpu =
        Device.is_gpu() && DriverVersion.find("CUDA") != std::string::npos;

    if (!isCudaGpu) {
      return -1;
    }
    std::cout << " CUDA device found " << std::endl;
    return 1;
  }
};
|
||
int main() { | ||
using namespace sycl; | ||
|
||
constexpr size_t WIDTH = 1024; | ||
constexpr size_t HEIGHT = 1024; | ||
constexpr float ALPHA = 1.0f; | ||
constexpr float BETA = 0.0f; | ||
|
||
std::vector<float> h_A(WIDTH * HEIGHT), h_B(WIDTH * HEIGHT), | ||
h_C(WIDTH * HEIGHT); | ||
|
||
std::cout << "Size: " << h_C.size() << std::endl; | ||
float *d_A, *d_B, *d_C; | ||
|
||
// A is an identity matrix | ||
std::fill(std::begin(h_A), std::end(h_A), 0.0f); | ||
for (size_t i = 0; i < WIDTH; i++) { | ||
h_A[i * WIDTH + i] = 1.0f; | ||
} | ||
|
||
// B is a matrix fill with 1 | ||
std::fill(std::begin(h_B), std::end(h_B), 1.0f); | ||
|
||
sycl::queue q{CUDASelector()}; | ||
|
||
cublasHandle_t handle; | ||
CHECK_ERROR(cublasCreate(&handle)); | ||
|
||
{ | ||
buffer<float, 2> b_A{h_A.data(), range<2>{WIDTH, HEIGHT}}; | ||
buffer<float, 2> b_B{h_B.data(), range<2>{WIDTH, HEIGHT}}; | ||
buffer<float, 2> b_C{h_C.data(), range<2>{WIDTH, HEIGHT}}; | ||
|
||
q.submit([&](handler &h) { | ||
auto d_A = b_A.get_access<sycl::access::mode::read>(h); | ||
auto d_B = b_B.get_access<sycl::access::mode::read>(h); | ||
auto d_C = b_C.get_access<sycl::access::mode::write>(h); | ||
|
||
h.interop_task([=](sycl::interop_handler ih) { | ||
cublasSetStream(handle, ih.get_queue<backend::cuda>()); | ||
|
||
auto cuA = reinterpret_cast<float *>(ih.get_mem<backend::cuda>(d_A)); | ||
auto cuB = reinterpret_cast<float *>(ih.get_mem<backend::cuda>(d_B)); | ||
auto cuC = reinterpret_cast<float *>(ih.get_mem<backend::cuda>(d_C)); | ||
|
||
CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT, | ||
WIDTH, &ALPHA, cuA, WIDTH, cuB, WIDTH, &BETA, | ||
cuC, WIDTH)); | ||
}); | ||
}); | ||
} | ||
|
||
// C must be all ones | ||
int i = 0; | ||
const bool allEqual = | ||
std::all_of(std::begin(h_C), std::end(h_C), [&i](float num) { | ||
++i; | ||
if (num != 1) { | ||
std::cout << i << " Not one : " << num << std::endl; | ||
Ruyk marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
return num == 1; | ||
}); | ||
|
||
if (!allEqual) { | ||
std::cout << " Incorrect result " << std::endl; | ||
} else { | ||
std::cout << " Correct! " << std::endl; | ||
} | ||
|
||
CHECK_ERROR(cublasDestroy(handle)); | ||
|
||
return allEqual ? EXIT_SUCCESS : EXIT_FAILURE; | ||
} |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is CMake 3.17 such a strong requirement?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `FindCUDAToolkit` module used below simplifies setting up the project, but it is only available in recent CMake versions (3.17 and later).