Skip to content

Commit e58b05f

Browse files
authored
Mixing CUDA Fortran and SYCL CUDA (#16)
* Mixing CUDA Fortran and SYCL CUDA Example showing how to link together a program written in CUDA FORTRAN that calls a routine that contains a SYCL program running on CUDA * Fixed: SYCL code does a proper SAXPY operation * Adding README * Using default stream for synchronization
1 parent 0bef3dc commit e58b05f

File tree

4 files changed

+139
-0
lines changed

4 files changed

+139
-0
lines changed

example-06/Makefile

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
CXX=clang++
2+
FORT=nvfortran
3+
FFLAGS=-c++libs -cuda
4+
CXXFLAGS=-fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -fsycl-unnamed-lambda
5+
DPCPP_PATH=/home/ruyman/sycl_workspace/build_dpcpp/install
6+
7+
default: final.exe
8+
9+
saxpy_sycl.so: saxpy.cpp
10+
$(CXX) $(CXXFLAGS) -fPIC --shared saxpy.cpp -o saxpy_sycl.so
11+
12+
saxpy_cuf.o: saxpy.cuf
13+
$(FORT) $(FFLAGS) -c saxpy.cuf -o saxpy_cuf.o
14+
15+
final.exe: saxpy_cuf.o saxpy_sycl.so
16+
$(FORT) $(FFLAGS) -o final.exe saxpy_cuf.o saxpy_sycl.so -L${DPCPP_PATH}/lib/ -lsycl
17+
18+
.PHONY: clean
19+
20+
clean:
21+
rm -f saxpy_cuf.o saxpy_sycl.so final.exe mathops.mod
22+

example-06/README.md

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
CUDA Frotran and SYCL integration
2+
======================================
3+
4+
This directory shows an example of how to call a SYCL function
5+
from a CUDA fortran code.
6+
7+
The SYCL routine is called using the Fortran ISO bindings like
8+
any other C function.
9+
10+
```fortran
11+
interface saxpy_sycl
12+
subroutine saxpy_call(x, y, a, N) &
13+
bind(C,name='saxpy_sycl_cuda_wrapper')
14+
implicit none
15+
real :: x(:), y(:)
16+
real, value :: a
17+
integer, value :: N
18+
end subroutine
19+
end interface
20+
```
21+
22+
The SYCL code implemented in the C++ version of the code works as usual with one minor modification:
23+
Uses the CUDA Primary context to enable inter-operating with the CUDA Fortran code, ensuring the same resources are shared.
24+
25+
The following snipped highligts the construction of a SYCL context associated with the Primary context.
26+
To ensure synchronization with the CUDA Fortran code, the queue will also be mapped to the default CUDA
27+
stream, instead of creating a new stream.
28+
It is possible to create a normal stream, just by using the default SYCL queue constructor on the CUDA
29+
context. Said queue will run concurrently (i.e. won't sync) to the main queue.
30+
31+
```cpp
32+
sycl::context c{sycl::property::context::cuda::use_primary_context()};
33+
sycl::queue q{c, c.get_devices()[0], sycl::property::queue::cuda::use_default_stream()};
34+
```
35+
36+

example-06/saxpy.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
#include <iostream>
2+
#include <CL/sycl.hpp>
3+
4+
extern "C" {
5+
void saxpy_sycl_cuda_wrapper (float* x, float* y, float a, int N);
6+
};
7+
8+
9+
void saxpy_sycl_cuda_wrapper (float* x, float* y, float a, int N) {
10+
sycl::context c{sycl::property::context::cuda::use_primary_context()};
11+
sycl::queue q{c, c.get_devices()[0], sycl::property::queue::cuda::use_default_stream()};
12+
{
13+
sycl::buffer bX {x, sycl::range<1>(N)};
14+
sycl::buffer bY {y, sycl::range<1>(N)};
15+
16+
q.submit([&](sycl::handler& h) {
17+
auto aX = bX.get_access<sycl::access::mode::read_write>(h);
18+
auto aY = bY.get_access<sycl::access::mode::read_write>(h);
19+
h.parallel_for<class saxpy_kernel>(sycl::range<1>(N), [=](sycl::id<1> id) {
20+
if (id[0] < N)
21+
aY[id] = aX[id] * a + aY[id];
22+
});
23+
});
24+
25+
q.wait_and_throw();
26+
}
27+
return;
28+
}

example-06/saxpy.cuf

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
module mathOps
2+
contains
3+
attributes(global) subroutine saxpy(x, y, a)
4+
implicit none
5+
real :: x(:), y(:)
6+
real, value :: a
7+
integer :: i, n
8+
n = size(x)
9+
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
10+
if (i <= n) y(i) = y(i) + a*x(i)
11+
end subroutine saxpy
12+
end module mathOps
13+
14+
program testSaxpy
15+
use mathOps
16+
use cudafor
17+
18+
implicit none
19+
20+
interface saxpy_sycl
21+
subroutine saxpy_call(x, y, a, N) &
22+
bind(C,name='saxpy_sycl_cuda_wrapper')
23+
implicit none
24+
real :: x(:), y(:)
25+
real, value :: a
26+
integer, value :: N
27+
end subroutine
28+
end interface
29+
30+
31+
integer, parameter :: N = 1024
32+
real :: x(N), y(N), a
33+
real, device :: x_d(N), y_d(N)
34+
type(dim3) :: grid, tBlock
35+
36+
tBlock = dim3(256,1,1)
37+
grid = dim3(ceiling(real(N)/tBlock%x),1,1)
38+
39+
write (*,*) 'CUDA version: '
40+
x = 1.0; y = 2.0; a = 2.0
41+
x_d = x
42+
y_d = y
43+
call saxpy<<<grid, tBlock>>>(x_d, y_d, a)
44+
y = y_d
45+
write(*,*) 'Max error: ', maxval(abs(y-4.0))
46+
write(*,*) 'N ', N
47+
48+
write (*,*) 'SYCL version: '
49+
y = 2.0;
50+
call saxpy_call(x, y, a, N);
51+
write(*,*) 'Max error: ', maxval(abs(y-4.0))
52+
53+
end program testSaxpy

0 commit comments

Comments
 (0)