Mixing CUDA Fortran and SYCL CUDA (#16)

Ruyk · web-flow · commit e58b05f55e3d · 2021-06-30T10:08:36.000+01:00
* Mixing CUDA Fortran and SYCL CUDA

Example showing how to link together a program written in CUDA FORTRAN
that calls a routine that contains a SYCL program running on CUDA

* Fixed: SYCL code does a proper SAXPY operation

* Adding README

* Using default stream for synchronization
diff --git a/example-06/Makefile b/example-06/Makefile
@@ -0,0 +1,22 @@
+CXX=clang++
+FORT=nvfortran
+FFLAGS=-c++libs -cuda  
+CXXFLAGS=-fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -fsycl-unnamed-lambda 
+DPCPP_PATH=/home/ruyman/sycl_workspace/build_dpcpp/install
+
+default: final.exe
+
+saxpy_sycl.so: saxpy.cpp
+	$(CXX) $(CXXFLAGS) -fPIC --shared saxpy.cpp -o saxpy_sycl.so
+
+saxpy_cuf.o: saxpy.cuf
+	$(FORT) $(FFLAGS) -c saxpy.cuf -o saxpy_cuf.o
+       
+final.exe: saxpy_cuf.o saxpy_sycl.so
+	$(FORT) $(FFLAGS) -o final.exe saxpy_cuf.o saxpy_sycl.so -L${DPCPP_PATH}/lib/ -lsycl 
+
+.PHONY: clean
+
+clean: 
+	rm -f saxpy_cuf.o saxpy_sycl.so final.exe mathops.mod
+
diff --git a/example-06/README.md b/example-06/README.md
@@ -0,0 +1,36 @@
+CUDA Frotran and SYCL integration
+======================================
+
+This directory shows an example of how to call a SYCL function
+from a CUDA fortran code.
+
+The SYCL routine is called using the Fortran ISO bindings like
+any other C function.
+
+```fortran
+interface saxpy_sycl
+  subroutine saxpy_call(x, y, a, N) &
+    bind(C,name='saxpy_sycl_cuda_wrapper')
+    implicit none
+    real :: x(:), y(:)
+    real, value :: a
+    integer, value :: N
+  end subroutine
+end interface
+```
+
+The SYCL code implemented in the C++ version of the code works as usual with one minor modification:
+Uses the CUDA Primary context to enable inter-operating with the CUDA Fortran code, ensuring the same resources are shared.
+
+The following snipped highligts the construction of a SYCL context associated with the Primary context.
+To ensure synchronization with the CUDA Fortran code, the queue will also be mapped to the default CUDA
+stream, instead of creating a new stream.
+It is possible to create a normal stream,  just by using the default SYCL queue constructor on the CUDA
+context. Said queue will run concurrently (i.e. won't sync) to the main queue.
+
+```cpp
+sycl::context c{sycl::property::context::cuda::use_primary_context()};
+sycl::queue q{c, c.get_devices()[0], sycl::property::queue::cuda::use_default_stream()};
+```
+
+
diff --git a/example-06/saxpy.cpp b/example-06/saxpy.cpp
@@ -0,0 +1,28 @@
+#include <iostream>
+#include <CL/sycl.hpp>
+
+extern "C" {
+   void saxpy_sycl_cuda_wrapper (float* x, float* y, float a, int N);
+};
+
+
+void saxpy_sycl_cuda_wrapper (float* x, float* y, float a, int N) {
+   sycl::context c{sycl::property::context::cuda::use_primary_context()};
+   sycl::queue q{c, c.get_devices()[0], sycl::property::queue::cuda::use_default_stream()};
+   {
+      sycl::buffer bX {x, sycl::range<1>(N)};
+      sycl::buffer bY {y, sycl::range<1>(N)};
+
+      q.submit([&](sycl::handler& h) {
+         auto aX = bX.get_access<sycl::access::mode::read_write>(h);
+         auto aY = bY.get_access<sycl::access::mode::read_write>(h);
+         h.parallel_for<class saxpy_kernel>(sycl::range<1>(N), [=](sycl::id<1> id) {
+            if (id[0] < N) 
+               aY[id] = aX[id] * a + aY[id];
+         });
+      });
+
+      q.wait_and_throw();
+   }
+   return;
+}
diff --git a/example-06/saxpy.cuf b/example-06/saxpy.cuf
@@ -0,0 +1,53 @@
+module mathOps
+contains
+  attributes(global) subroutine saxpy(x, y, a)
+    implicit none
+    real :: x(:), y(:)
+    real, value :: a
+    integer :: i, n
+    n = size(x)
+    i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
+    if (i <= n) y(i) = y(i) + a*x(i)
+  end subroutine saxpy 
+end module mathOps
+
+program testSaxpy
+  use mathOps
+  use cudafor
+
+  implicit none
+
+interface saxpy_sycl
+  subroutine saxpy_call(x, y, a, N) &
+    bind(C,name='saxpy_sycl_cuda_wrapper')
+    implicit none
+    real :: x(:), y(:)
+    real, value :: a
+    integer, value :: N
+  end subroutine
+end interface
+
+
+  integer, parameter :: N = 1024
+  real :: x(N), y(N), a
+  real, device :: x_d(N), y_d(N)
+  type(dim3) :: grid, tBlock
+
+  tBlock = dim3(256,1,1)
+  grid = dim3(ceiling(real(N)/tBlock%x),1,1)
+
+  write (*,*) 'CUDA version: '
+  x = 1.0; y = 2.0; a = 2.0
+  x_d = x
+  y_d = y
+  call saxpy<<<grid, tBlock>>>(x_d, y_d, a)
+  y = y_d
+  write(*,*) 'Max error: ', maxval(abs(y-4.0))
+  write(*,*) 'N ', N
+
+  write (*,*) 'SYCL version: '
+  y = 2.0;
+  call saxpy_call(x, y, a, N);
+  write(*,*) 'Max error: ', maxval(abs(y-4.0))
+
+end program testSaxpy