codeplaysoftware · Ruyk · Apr 19, 2023 · Feb 9, 2023 · Feb 9, 2023 · Feb 13, 2023
diff --git a/.gitignore b/.gitignore
@@ -34,4 +34,7 @@
 # Temporaries
 *~
 *#
-*/build
+*/build
+]
+# vim
+*.swp
diff --git a/examples/sgemm_interop/sycl_sgemm.cpp b/examples/sgemm_interop/sycl_sgemm.cpp
@@ -25,8 +25,7 @@
 #include <iostream>
 #include <vector>
 
-#include <CL/sycl.hpp>
-#include <CL/sycl/backend/cuda.hpp>
+#include <sycl/sycl.hpp>
 
 #include <cublas_v2.h>
 #include <cuda.h>
@@ -47,25 +46,6 @@ void inline checkCudaErrorMsg(cudaError status, const char *msg) {
   }
 }
 
-void inline checkCudaErrorMsg(CUresult status, const char *msg) {
-  if (status != CUDA_SUCCESS) {
-    std::cout << "ERROR CUDA: " << msg << " - " << status << std::endl;
-    exit(EXIT_FAILURE);
-  }
-}
-
-class CUDASelector : public sycl::device_selector {
-public:
-  int operator()(const sycl::device &device) const override {
-    if(device.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda){
-      std::cout << " CUDA device found " << std::endl;
-      return 1;
-    } else{
-      return -1;
-    }
-  }
-};
-
 int main() {
   using namespace sycl;
 
@@ -88,7 +68,9 @@ int main() {
   // B is a matrix fill with 1
   std::fill(std::begin(h_B), std::end(h_B), 1.0f);
 
-  sycl::queue q{CUDASelector()};
+  sycl::queue q{[](auto &d) {
+    return (d.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda);
+  }};
 
   cublasHandle_t handle;
   CHECK_ERROR(cublasCreate(&handle));
@@ -104,12 +86,16 @@ int main() {
       auto d_C = b_C.get_access<sycl::access::mode::write>(h);
 
       h.host_task([=](sycl::interop_handle ih) {
-        cuCtxSetCurrent(ih.get_native_context<backend::ext_oneapi_cuda>());
+        // Set the correct cuda context & stream
+	cuCtxSetCurrent(ih.get_native_context<backend::ext_oneapi_cuda>());
         auto cuStream = ih.get_native_queue<backend::ext_oneapi_cuda>();
         cublasSetStream(handle, cuStream);
-        auto cuA = reinterpret_cast<float *>(ih.get_native_mem<backend::ext_oneapi_cuda>(d_A));
-        auto cuB = reinterpret_cast<float *>(ih.get_native_mem<backend::ext_oneapi_cuda>(d_B));
-        auto cuC = reinterpret_cast<float *>(ih.get_native_mem<backend::ext_oneapi_cuda>(d_C));
+        auto cuA = reinterpret_cast<float *>(
+            ih.get_native_mem<backend::ext_oneapi_cuda>(d_A));
+        auto cuB = reinterpret_cast<float *>(
+            ih.get_native_mem<backend::ext_oneapi_cuda>(d_B));
+        auto cuC = reinterpret_cast<float *>(
+            ih.get_native_mem<backend::ext_oneapi_cuda>(d_C));
 
         CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
                                 WIDTH, &ALPHA, cuA, WIDTH, cuB, WIDTH, &BETA,

diff --git a/examples/sgemm_interop/sycl_sgemm_usm.cpp b/examples/sgemm_interop/sycl_sgemm_usm.cpp
@@ -25,8 +25,7 @@
 #include <iostream>
 #include <vector>
 
-#include <CL/sycl.hpp>
-#include <CL/sycl/backend/cuda.hpp>
+#include <sycl/sycl.hpp>
 
 #include <cublas_v2.h>
 #include <cuda.h>
@@ -47,25 +46,6 @@ void inline checkCudaErrorMsg(cudaError status, const char *msg) {
   }
 }
 
-void inline checkCudaErrorMsg(CUresult status, const char *msg) {
-  if (status != CUDA_SUCCESS) {
-    std::cout << "ERROR CUDA: " << msg << " - " << status << std::endl;
-    exit(EXIT_FAILURE);
-  }
-}
-
-class CUDASelector : public sycl::device_selector {
-public:
-  int operator()(const sycl::device &device) const override {
-    if(device.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda){
-      std::cout << " CUDA device found " << std::endl;
-      return 1;
-    } else{
-      return -1;
-    }
-  }
-};
-
 int main() {
   using namespace sycl;
 
@@ -88,12 +68,14 @@ int main() {
   // B is a matrix fill with 1
   std::fill(std::begin(h_B), std::end(h_B), 1.0f);
 
-  sycl::queue q{CUDASelector()};
+  sycl::queue q{[](auto &d) {
+    return (d.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda);
+  }};
 
   // Allocate memory on the device
-  float* d_A = sycl::malloc_device<float>(WIDTH*HEIGHT,q);
-  float* d_B = sycl::malloc_device<float>(WIDTH*HEIGHT,q);
-  float* d_C = sycl::malloc_device<float>(WIDTH*HEIGHT,q);
+  float *d_A = sycl::malloc_device<float>(WIDTH * HEIGHT, q);
+  float *d_B = sycl::malloc_device<float>(WIDTH * HEIGHT, q);
+  float *d_C = sycl::malloc_device<float>(WIDTH * HEIGHT, q);
 
   // Copy matrices A & B to device from host vectors
   const size_t numBytes = WIDTH * HEIGHT * sizeof(float);
@@ -105,21 +87,19 @@ int main() {
   CHECK_ERROR(cublasCreate(&handle));
 
   q.submit([&](handler &h) {
-
-    h.host_task([=](sycl::interop_handle ih) {
-
-      // Set the correct cuda context & stream
-      cuCtxSetCurrent(ih.get_native_context<backend::ext_oneapi_cuda>());
-      auto cuStream = ih.get_native_queue<backend::ext_oneapi_cuda>();
-      cublasSetStream(handle, cuStream);
-
-      // Call generalised matrix-matrix multiply
-      CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
-                              WIDTH, &ALPHA, d_A, WIDTH, d_B, WIDTH, &BETA,
-                              d_C, WIDTH));
-      cuStreamSynchronize(cuStream);
-    });
-  }).wait();
+     h.host_task([=](sycl::interop_handle ih) {
+       // Set the correct cuda context & stream
+       cuCtxSetCurrent(ih.get_native_context<backend::ext_oneapi_cuda>());
+       auto cuStream = ih.get_native_queue<backend::ext_oneapi_cuda>();
+       cublasSetStream(handle, cuStream);
+
+       // Call generalised matrix-matrix multiply
+       CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
+                               WIDTH, &ALPHA, d_A, WIDTH, d_B, WIDTH, &BETA,
+                               d_C, WIDTH));
+       cuStreamSynchronize(cuStream);
+     });
+   }).wait();
 
   // Copy the result back to host
   q.memcpy(h_C.data(), d_C, numBytes).wait();

diff --git a/examples/vector_addition/README.md b/examples/vector_addition/README.md
@@ -7,9 +7,6 @@ to highlight how to build an application with SYCL for CUDA using DPC++ support,
 for which an example CMakefile is provided. For detailed documentation on how to
 migrate from CUDA to SYCL, see [SYCL For CUDA Developers](https://developer.codeplay.com/products/computecpp/ce/guides/sycl-for-cuda-developers).
 
-Note currently the CUDA backend does not support the [USM](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc) extension, so we use
-`sycl::buffer` and `sycl::accessors` instead.
-
 Pre-requisites
 ---------------
 

diff --git a/examples/vector_addition/vector_addition.cpp b/examples/vector_addition/vector_addition.cpp
@@ -34,10 +34,9 @@ int main(int argc, char *argv[]) {
 
   // Initialize input data
   {
-    const auto dwrite_t = sycl::access::mode::discard_write;
+    sycl::host_accessor h_a{bufA, sycl::write_only};
+    sycl::host_accessor h_b{bufB, sycl::write_only};
 
-    auto h_a = bufA.get_access<dwrite_t>();
-    auto h_b = bufB.get_access<dwrite_t>();
     for (int i = 0; i < N; i++) {
       h_a[i] = sin(i) * sin(i);
       h_b[i] = cos(i) * cos(i);
@@ -63,15 +62,14 @@ int main(int argc, char *argv[]) {
     auto b = bufB.get_access<read_t>(h);
     auto c = bufC.get_access<write_t>(h);
 
-    h.parallel_for(VecSize,
-                   [=](sycl::id<1> i) { c[i] = a[i] + b[i]; });
+    h.parallel_for(VecSize, [=](sycl::id<1> i) { c[i] = a[i] + b[i]; });
   };
 
   myQueue.submit(cg);
 
   {
-    const auto read_t = sycl::access::mode::read;
-    auto h_c = bufC.get_access<read_t>();
+    sycl::host_accessor h_c{bufC, sycl::read_only};
+
     double sum = 0.0f;
     for (int i = 0; i < N; i++) {
       sum += h_c[i];
-Original file line number
+Diff line change
@@ Expand Up / @@ -34,4 +34,7 @@ @@
     # Temporaries
     *~
     *#
-    */build
+    */build
+    ]
+    # vim
+    *.swp