WIP: pop this and continue experimentation

Rust-GPU · thedodd · Nov 15, 2022 · Nov 20, 2022 · Mar 30, 2023 · Mar 30, 2023
commit 95f1066d5c07e1a99e8f71db32463a005ca98f3a
diff --git a/crates/cuda_std/src/rt/mod.rs b/crates/cuda_std/src/rt/mod.rs
@@ -35,11 +35,6 @@ pub struct Stream {
 }
 
 impl Stream {
-    // /// Creates a new stream with flags.
-    // pub fn new(flags: StreamFlags) -> Self {
-    //     Self {}
-    // }
-
     /// Creates a new stream with flags.
     pub fn new(flags: StreamFlags) -> CudaResult<Self> {
         let mut stream = MaybeUninit::uninit();
@@ -52,11 +47,10 @@ impl Stream {
         }
     }
 
-    // #[doc(hidden)]
-    // pub fn launch(&self, param_buf: *mut c_void) -> CudaResult<()> {
-    //     unsafe { cuda::cudaLaunchDeviceV2(param_buf, core::ptr::null_mut()).to_result() }
-    //     // unsafe { cuda::cudaLaunchDeviceV2(param_buf, self.raw).to_result() }
-    // }
+    #[doc(hidden)] // Hidden from rustdoc; presumably the target of `$stream.launch(buf)` in `launch!`.
+    pub unsafe fn launch(&self, param_buf: *mut c_void) -> CudaResult<()> {
+        cuda::cudaLaunchDeviceV2(param_buf, self.raw).to_result() // FFI call on this stream's raw handle
+    } // NOTE(review): `unsafe` — validity of `param_buf` is assumed to be the caller's duty; confirm.
 }
 
 impl Drop for Stream {
@@ -69,8 +63,7 @@ impl Drop for Stream {
 
 #[macro_export]
 macro_rules! launch {
-    // ($func:ident<<<$grid_dim:expr, $block_dim:expr, $smem_size:expr, $stream:ident>>>($($param:expr),* $(,)?)) => {{
-    ($func:ident<<<$grid_dim:expr, $block_dim:expr, ($smem_size:expr)>>>($($param:expr),* $(,)?)) => {{
+    ($func:ident<<<$grid_dim:expr, $block_dim:expr, $smem_size:expr, $stream:ident>>>($($param:expr),* $(,)?)) => {{
         use $crate::rt::ToResult;
         use $crate::float::GpuFloat;
         let grid_dim = $crate::rt::GridSize::from($grid_dim);
@@ -108,69 +101,12 @@ macro_rules! launch {
             offset += size;
         )*
         if false {
+            // Ensure function call compatibility at compile time.
             $func($($param),*);
         }
-        // unsafe {
-        //     let mut offset = 0;
-        //     $(
-        //         let param = $param;
-        //         let size = ::core::mem::size_of_val(&param);
-        //         let mut buf_idx = (offset as f32 / size as f32).ceil() as usize + 1;
-        //         offset = buf_idx * size;
-        //         let ptr = &param as *const _ as *const u8;
-        //         let dst = buf.add(offset);
-        //         ::core::ptr::copy_nonoverlapping(&param as *const _ as *const u8, dst, size);
-        //     )*
-        // }
-        // if false {
-        //     $func($($param),*);
-        // }
 
         // Launch the kernel.
-        $crate::rt::sys::cudaLaunchDeviceV2(buf as *mut ::core::ffi::c_void, ::core::ptr::null_mut() as *mut _)
-
-        // let mut buf = $crate::rt::sys::cudaGetParameterBuffer(alignment, size) as *mut u8;
-
-        // // Populate the buffer with given arguments.
-        // let mut offset = 0;
-        // $(
-        //     let param = $param;
-        //     let size = ::core::mem::size_of_val(&param);
-        //     let buf_bytes_ptr = (buf as *mut u8).add(offset);
-        //     ::core::ptr::copy_nonoverlapping($param as *const _, buf_bytes_ptr.into(), size);
-        //     offset += size;
-        // )*
-
-        // let mut offset = 0;
-        // $(
-        //     let param = $param;
-        //     let size = ::core::mem::size_of_val(&param);
-        //     let mut buf_idx = (offset as f32 / size as f32).ceil() as usize + 1;
-        //     offset = buf_idx * size;
-        //     let ptr = &param as *const _ as *const u8;
-        //     let dst = buf.add(offset);
-        //     ::core::ptr::copy_nonoverlapping(&param as *const _ as *const u8, dst, size);
-        // )*
-
-        // // Launch the kernel.
-        // let fptr = $func as *const ();
-        // $crate::rt::sys::cudaLaunchDevice(
-        //     fptr as *const ::core::ffi::c_void,
-        //     buf as *mut ::core::ffi::c_void,
-        //     $crate::rt::sys::dim3 {
-        //         x: grid_dim.x,
-        //         y: grid_dim.y,
-        //         z: grid_dim.z
-        //     },
-        //     $crate::rt::sys::dim3 {
-        //         x: block_dim.x,
-        //         y: block_dim.y,
-        //         z: block_dim.z
-        //     },
-        //     $smem_size,
-        //     ::core::ptr::null_mut() as *mut _,
-        //     // $stream.raw,
-        // )
+        $stream.launch(buf)
     }};
 }
 

diff --git a/crates/cust/Cargo.toml b/crates/cust/Cargo.toml
@@ -22,6 +22,7 @@ mint = { version = "^0.5", optional = true }
 num-complex = { version = "0.4", optional = true }
 vek = { version = "0.15.1", optional = true, default-features = false }
 bytemuck = { version = "1.7.3", optional = true }
+find_cuda_helper = { path = "../find_cuda_helper", version = "0.2" }
 
 [features]
 default= ["bytemuck"]

diff --git a/crates/cust/src/link.rs b/crates/cust/src/link.rs
@@ -3,9 +3,9 @@
 use std::mem::MaybeUninit;
 use std::ptr::null_mut;
 
+use crate::error::{CudaError, CudaResult, ToResult};
 use crate::sys as cuda;
-
-use crate::error::{CudaResult, ToResult};
+use find_cuda_helper::find_lib_cudadevrt;
 
 static UNNAMED: &str = "\0";
 
@@ -25,6 +25,12 @@ impl Linker {
         // Therefore we use box to alloc the memory for us, then into_raw it so we now have ownership
         // of the memory (and dont have any aliasing requirements attached either).
 
+        // // Just take advantage of C memory model and just pass individual elements, as there is only 1.
+        // let num_options: u32 = 1;
+        // let opt = &mut cuda::CUjit_option::CU_JIT_TARGET as *mut _;
+        // let mut opt_val =
+        //     &mut cuda::CUjit_target::CU_TARGET_COMPUTE_75 as *mut _ as *mut ::std::os::raw::c_void;
+
         unsafe {
             let mut raw = MaybeUninit::uninit();
             cuda::cuLinkCreate_v2(0, null_mut(), null_mut(), raw.as_mut_ptr()).to_result()?;
@@ -116,15 +122,15 @@ impl Linker {
 
     /// Link device runtime lib.
     pub fn add_libcudadevrt(&mut self) -> CudaResult<()> {
-        let mut bytes = std::fs::read("/usr/local/cuda-11/lib64/libcudadevrt.a")
-            .expect("could not read libcudadevrt.a");
+        let path = find_lib_cudadevrt().ok_or_else(|| CudaError::FileNotFound)?;
+        let mut bytes = std::fs::read(path)
+            // TODO: don't panic, update the result type instead.
+            .expect("error linking libcudadevrt.a");
 
         unsafe {
             cuda::cuLinkAddData_v2(
                 self.raw,
                 cuda::CUjitInputType::CU_JIT_INPUT_LIBRARY,
-                // cuda_sys wants *mut but from the API docs we know we retain ownership so
-                // this cast is sound.
                 bytes.as_mut_ptr() as *mut _,
                 bytes.len(),
                 UNNAMED.as_ptr().cast(),

diff --git a/crates/find_cuda_helper/src/lib.rs b/crates/find_cuda_helper/src/lib.rs
@@ -150,6 +150,16 @@ pub fn find_cuda_lib_dirs() -> Vec<PathBuf> {
     valid_paths
 }
 
+/// Locates the CUDA device runtime static library, `libcudadevrt.a`.
+///
+/// Only `<cuda root>/lib64/libcudadevrt.a` is probed, where the root is whatever
+/// `find_cuda_root` reports. Returns `None` when no root is found or when that
+/// path is not an existing file.
+pub fn find_lib_cudadevrt() -> Option<PathBuf> {
+    let candidate = find_cuda_root()?.join("lib64").join("libcudadevrt.a");
+    candidate.is_file().then(|| candidate)
+}
+
 #[cfg(target_os = "windows")]
 pub fn find_optix_root() -> Option<PathBuf> {
     // the optix SDK installer sets OPTIX_ROOT_DIR whenever it installs.