File tree Expand file tree Collapse file tree 4 files changed +139
-0
lines changed Expand file tree Collapse file tree 4 files changed +139
-0
lines changed Original file line number Diff line number Diff line change
1
+ CXX =clang++
2
+ FORT =nvfortran
3
+ FFLAGS =-c++libs -cuda
4
+ CXXFLAGS =-fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -fsycl-unnamed-lambda
5
+ DPCPP_PATH =/home/ruyman/sycl_workspace/build_dpcpp/install
6
+
7
+ default : final.exe
8
+
9
+ saxpy_sycl.so : saxpy.cpp
10
+ $(CXX ) $(CXXFLAGS ) -fPIC --shared saxpy.cpp -o saxpy_sycl.so
11
+
12
+ saxpy_cuf.o : saxpy.cuf
13
+ $(FORT ) $(FFLAGS ) -c saxpy.cuf -o saxpy_cuf.o
14
+
15
+ final.exe : saxpy_cuf.o saxpy_sycl.so
16
+ $(FORT ) $(FFLAGS ) -o final.exe saxpy_cuf.o saxpy_sycl.so -L${DPCPP_PATH} /lib/ -lsycl
17
+
18
+ .PHONY : clean
19
+
20
+ clean :
21
+ rm -f saxpy_cuf.o saxpy_sycl.so final.exe mathops.mod
22
+
Original file line number Diff line number Diff line change
1
+ CUDA Frotran and SYCL integration
2
+ ======================================
3
+
4
+ This directory shows an example of how to call a SYCL function
5
+ from a CUDA fortran code.
6
+
7
+ The SYCL routine is called using the Fortran ISO bindings like
8
+ any other C function.
9
+
10
+ ``` fortran
11
+ interface saxpy_sycl
12
+ subroutine saxpy_call(x, y, a, N) &
13
+ bind(C,name='saxpy_sycl_cuda_wrapper')
14
+ implicit none
15
+ real :: x(:), y(:)
16
+ real, value :: a
17
+ integer, value :: N
18
+ end subroutine
19
+ end interface
20
+ ```
21
+
22
+ The SYCL code implemented in the C++ version of the code works as usual with one minor modification:
23
+ Uses the CUDA Primary context to enable inter-operating with the CUDA Fortran code, ensuring the same resources are shared.
24
+
25
+ The following snipped highligts the construction of a SYCL context associated with the Primary context.
26
+ To ensure synchronization with the CUDA Fortran code, the queue will also be mapped to the default CUDA
27
+ stream, instead of creating a new stream.
28
+ It is possible to create a normal stream, just by using the default SYCL queue constructor on the CUDA
29
+ context. Said queue will run concurrently (i.e. won't sync) to the main queue.
30
+
31
+ ``` cpp
32
+ sycl::context c{sycl::property::context::cuda::use_primary_context ()};
33
+ sycl::queue q{c, c.get_devices()[ 0] , sycl::property::queue::cuda::use_default_stream()};
34
+ ```
35
+
36
+
Original file line number Diff line number Diff line change
1
+ #include < iostream>
2
+ #include < CL/sycl.hpp>
3
+
4
+ extern " C" {
5
+ void saxpy_sycl_cuda_wrapper (float * x, float * y, float a, int N);
6
+ };
7
+
8
+
9
+ void saxpy_sycl_cuda_wrapper (float * x, float * y, float a, int N) {
10
+ sycl::context c{sycl::property::context::cuda::use_primary_context ()};
11
+ sycl::queue q{c, c.get_devices ()[0 ], sycl::property::queue::cuda::use_default_stream ()};
12
+ {
13
+ sycl::buffer bX {x, sycl::range<1 >(N)};
14
+ sycl::buffer bY {y, sycl::range<1 >(N)};
15
+
16
+ q.submit ([&](sycl::handler& h) {
17
+ auto aX = bX.get_access <sycl::access::mode::read_write>(h);
18
+ auto aY = bY.get_access <sycl::access::mode::read_write>(h);
19
+ h.parallel_for <class saxpy_kernel >(sycl::range<1 >(N), [=](sycl::id<1 > id) {
20
+ if (id[0 ] < N)
21
+ aY[id] = aX[id] * a + aY[id];
22
+ });
23
+ });
24
+
25
+ q.wait_and_throw ();
26
+ }
27
+ return ;
28
+ }
Original file line number Diff line number Diff line change
1
+ module mathOps
2
+ contains
3
+ attributes(global) subroutine saxpy(x, y, a)
4
+ implicit none
5
+ real :: x(:), y(:)
6
+ real, value :: a
7
+ integer :: i, n
8
+ n = size(x)
9
+ i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
10
+ if (i <= n) y(i) = y(i) + a*x(i)
11
+ end subroutine saxpy
12
+ end module mathOps
13
+
14
+ program testSaxpy
15
+ use mathOps
16
+ use cudafor
17
+
18
+ implicit none
19
+
20
+ interface saxpy_sycl
21
+ subroutine saxpy_call(x, y, a, N) &
22
+ bind(C,name='saxpy_sycl_cuda_wrapper')
23
+ implicit none
24
+ real :: x(:), y(:)
25
+ real, value :: a
26
+ integer, value :: N
27
+ end subroutine
28
+ end interface
29
+
30
+
31
+ integer, parameter :: N = 1024
32
+ real :: x(N), y(N), a
33
+ real, device :: x_d(N), y_d(N)
34
+ type(dim3) :: grid, tBlock
35
+
36
+ tBlock = dim3(256,1,1)
37
+ grid = dim3(ceiling(real(N)/tBlock%x),1,1)
38
+
39
+ write (*,*) 'CUDA version: '
40
+ x = 1.0; y = 2.0; a = 2.0
41
+ x_d = x
42
+ y_d = y
43
+ call saxpy<<<grid, tBlock>>>(x_d, y_d, a)
44
+ y = y_d
45
+ write(*,*) 'Max error: ', maxval(abs(y-4.0))
46
+ write(*,*) 'N ', N
47
+
48
+ write (*,*) 'SYCL version: '
49
+ y = 2.0;
50
+ call saxpy_call(x, y, a, N);
51
+ write(*,*) 'Max error: ', maxval(abs(y-4.0))
52
+
53
+ end program testSaxpy
You can’t perform that action at this time.
0 commit comments