Merge branch 'develop' into woptim/build-cache
adrienbernede committed Jun 6, 2024
2 parents 5cc75c4 + dfaab80 commit bc99666
Showing 24 changed files with 363 additions and 276 deletions.
1 change: 1 addition & 0 deletions .gitlab-ci.yml
@@ -100,6 +100,7 @@ trigger-rajaperf:
strategy: depend

include:
# Sets ID tokens for every job using `default:`
- project: 'lc-templates/id_tokens'
file: 'id_tokens.yml'
# [Optional] checks preliminary to running the actual CI test
8 changes: 4 additions & 4 deletions .gitlab/jobs/tioga.yml
@@ -26,12 +26,12 @@
# ${PROJECT_<MACHINE>_DEPS} in the extra jobs. There is no reason not to fully
# describe the spec here.

rocmcc_5_7_1_hip_desul_atomics:
rocmcc_6_1_1_hip_desul_atomics:
variables:
SPEC: "~shared +rocm ~openmp +desul +tests amdgpu_target=gfx90a %rocmcc@=5.7.1 ^hip@5.7.1 ^blt@develop"
SPEC: "~shared +rocm ~openmp +desul +tests amdgpu_target=gfx90a %rocmcc@=6.1.1 ^hip@6.1.1 ^blt@develop"
extends: .job_on_tioga

rocmcc_5_7_1_hip_openmp:
rocmcc_6_1_1_hip_openmp:
variables:
SPEC: "~shared +rocm +openmp +omptask +tests amdgpu_target=gfx90a %rocmcc@=5.7.1 ^hip@5.7.1 ^blt@develop"
SPEC: "~shared +rocm +openmp +omptask +tests amdgpu_target=gfx90a %rocmcc@=6.1.1 ^hip@6.1.1 ^blt@develop"
extends: .job_on_tioga
2 changes: 1 addition & 1 deletion .uberenv_config.json
@@ -4,7 +4,7 @@
"package_final_phase" : "initconfig",
"package_source_dir" : "../..",
"spack_url": "https://github.com/spack/spack.git",
"spack_branch": "bugfix/invalid-compiler-warning",
"spack_branch": "develop-2024-05-26",
"spack_activate" : {},
"spack_configs_path": "scripts/radiuss-spack-configs",
"spack_packages_path": "scripts/radiuss-spack-configs/packages",
91 changes: 51 additions & 40 deletions examples/dynamic_mat_transpose.cpp
@@ -11,7 +11,6 @@
#include <iostream>

#include "RAJA/RAJA.hpp"
#include "memoryManager.hpp"

/*
* Matrix Transpose Example
@@ -96,7 +95,7 @@ using outer0 = RAJA::LoopPolicy<
#endif
#if defined(RAJA_ENABLE_SYCL)
,
RAJA::sycl_group_0_direct
RAJA::sycl_group_2_direct
#endif
>;

@@ -135,7 +134,7 @@ using inner0 = RAJA::LoopPolicy<
#endif
#if defined(RAJA_ENABLE_SYCL)
,
RAJA::sycl_local_0_direct
RAJA::sycl_local_2_direct
#endif
>;

@@ -154,20 +153,9 @@ using inner1 = RAJA::LoopPolicy<RAJA::seq_exec
#endif
>;

template<typename T>
void switch_ptrs(T *A, T *d_A)
{
T *tmp_ptr;
tmp_ptr = d_A;
d_A = A;
A = tmp_ptr;
}

int main(int argc, char *argv[])
{

std::cout << "\n\nRAJA matrix transpose example...\n";

if(argc != 2) {
RAJA_ABORT_OR_THROW("Usage ./dynamic_mat_transpose host or ./dynamic_mat_transpose device");
}
@@ -185,17 +173,26 @@ int main(int argc, char *argv[])

RAJA::ExecPlace select_cpu_or_gpu;
if(exec_space.compare("host") == 0)
{ select_cpu_or_gpu = RAJA::ExecPlace::HOST; printf("Running RAJA::launch reductions example on the host \n"); }
{ select_cpu_or_gpu = RAJA::ExecPlace::HOST; std::cout<<"Running RAJA::launch matrix transpose example on the host"<<std::endl; }
if(exec_space.compare("device") == 0)
{ select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; printf("Running RAJA::launch reductions example on the device \n"); }


{ select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; std::cout<<"Running RAJA::launch matrix transpose example on the device" <<std::endl; }

RAJA::resources::Host host_res;
#if defined(RAJA_ENABLE_CUDA)
RAJA::resources::Cuda device_res;
#endif
#if defined(RAJA_ENABLE_HIP)
RAJA::resources::Hip device_res;
#endif
#if defined(RAJA_ENABLE_SYCL)
memoryManager::sycl_res = new camp::resources::Resource{camp::resources::Sycl()};
::RAJA::sycl::detail::setQueue(memoryManager::sycl_res);
RAJA::resources::Sycl device_res;
#endif

#if defined(RAJA_GPU_ACTIVE)
RAJA::resources::Resource res = RAJA::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu);
#else
RAJA::resources::Resource res = RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu);
#endif
//
// Define num rows/cols in matrix, tile dimensions, and number of tiles
//
@@ -212,9 +209,8 @@ int main(int argc, char *argv[])
//
// Allocate matrix data
//
int *A = memoryManager::allocate<int>(N_r * N_c);
int *At = memoryManager::allocate<int>(N_r * N_c);

int *A = host_res.allocate<int>(N_r * N_c);
int *At = host_res.allocate<int>(N_r * N_c);
//
// In the following implementations of matrix transpose, we
// use RAJA 'View' objects to access the matrix data. A RAJA view
@@ -300,20 +296,24 @@ int main(int argc, char *argv[])

std::cout << "\n Running RAJA matrix transpose w/ dynamic shared memory ...\n";

#if defined(RAJA_ENABLE_HIP)
//Reset memory
std::memset(At, 0, N_r * N_c * sizeof(int));

//Hip requires device side pointers
#if defined(RAJA_GPU_ACTIVE)
//Allocate device side pointers
int *d_A = nullptr, *d_At = nullptr;

if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) {
d_A = memoryManager::allocate_gpu<int>(N_r * N_c);
d_At = memoryManager::allocate_gpu<int>(N_r * N_c);

hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice ));
d_A = device_res.allocate<int>(N_r * N_c);
d_At = device_res.allocate<int>(N_r * N_c);

device_res.memcpy(d_A, A, sizeof(int) * N_r * N_c);
device_res.memcpy(d_At, At, sizeof(int) * N_r * N_c);

//switch host/device pointers so we can reuse the views
switch_ptrs(d_A, A);
switch_ptrs(d_At, At);
Aview.set_data(d_A);
Atview.set_data(d_At);
}
#endif

@@ -323,13 +323,11 @@

// _dynamic_mattranspose_kernel_start
RAJA::launch<launch_policy>
(select_cpu_or_gpu,
RAJA::LaunchParams(RAJA::Teams(outer_Dimr, outer_Dimc),
RAJA::Threads(TILE_DIM, TILE_DIM), dynamic_shared_mem_size),
(res, RAJA::LaunchParams(RAJA::Teams(outer_Dimc, outer_Dimr),
RAJA::Threads(TILE_DIM, TILE_DIM), dynamic_shared_mem_size),
"Matrix tranpose with dynamic shared memory kernel",
[=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
{

RAJA::loop<outer1>(ctx, RAJA::RangeSegment(0, outer_Dimr), [&] (int by){
RAJA::loop<outer0>(ctx, RAJA::RangeSegment(0, outer_Dimc), [&] (int bx){

@@ -378,24 +376,37 @@ int main(int argc, char *argv[])
ctx.releaseSharedMemory();
});
});

});
// _dynamic_mattranspose_kernel_end


#if defined(RAJA_ENABLE_HIP)
#if defined(RAJA_GPU_ACTIVE)
if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) {
switch_ptrs(d_At, At);
switch_ptrs(d_A, A);

hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost ));
device_res.memcpy(A, d_A, sizeof(int) * N_r * N_c);
device_res.memcpy(At, d_At, sizeof(int) * N_r * N_c);

Aview.set_data(A);
Atview.set_data(At);
}
#endif


checkResult<int>(Atview, N_c, N_r);
//printResult<int>(Atview, N_c, N_r);
//----------------------------------------------------------------------------//

//Release data
host_res.deallocate(A);
host_res.deallocate(At);

#if defined(RAJA_GPU_ACTIVE)
if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) {
device_res.deallocate(d_A);
device_res.deallocate(d_At);
}
#endif


return 0;
}
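
A minimal, self-contained sketch of the resource-based memory pattern the updated example adopts above: host and device allocation through RAJA resource objects, device copies via Resource::memcpy, and retargeting a RAJA::View with set_data. This sketch is not taken from the commit; the CUDA backend choice, the size N, and the variable names are illustrative assumptions (HIP and SYCL resources expose the same allocate/memcpy/deallocate interface).

// Sketch only: assumes a CUDA-enabled RAJA build.
#include <cstring>
#include "RAJA/RAJA.hpp"

int main()
{
  constexpr int N = 16;

  RAJA::resources::Host host_res;
  RAJA::resources::Cuda device_res;  // assumption: CUDA backend enabled

  // Host-side allocation through the resource (replaces memoryManager)
  int *A = host_res.allocate<int>(N * N);
  std::memset(A, 0, N * N * sizeof(int));

  // The view initially wraps host data
  RAJA::View<int, RAJA::Layout<2>> Aview(A, N, N);

  // Device-side allocation and copy, then point the view at device memory
  int *d_A = device_res.allocate<int>(N * N);
  device_res.memcpy(d_A, A, sizeof(int) * N * N);
  Aview.set_data(d_A);

  // ... launch kernels that read and write through Aview ...

  // Copy results back and restore the host pointer in the view
  device_res.memcpy(A, d_A, sizeof(int) * N * N);
  Aview.set_data(A);

  device_res.deallocate(d_A);
  host_res.deallocate(A);

  return 0;
}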

5 changes: 4 additions & 1 deletion examples/resource-dynamic-forall.cpp
@@ -110,9 +110,12 @@ int main(int argc, char *argv[])
#if defined(RAJA_ENABLE_HIP)
RAJA::resources::Hip device_res;
#endif
#if defined(RAJA_ENABLE_SYCL)
RAJA::resources::Sycl device_res;
#endif

//Get typed erased resource - it will internally store if we are running on the host or device
#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || defined(RAJA_ENABLE_SYCL)
RAJA::resources::Resource res = RAJA::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu);
#else
RAJA::resources::Resource res = RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu);
2 changes: 1 addition & 1 deletion examples/resource-runtime-launch.cpp
@@ -153,7 +153,7 @@ int main(int argc, char *argv[])
#endif

//Get typed erased resource - it will internally store if we are running on the host or device
#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
#if defined(RAJA_GPU_ACTIVE) && !defined(RAJA_ENABLE_SYCL)
RAJA::resources::Resource res = RAJA::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu);
#else
RAJA::resources::Resource res = RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu);
2 changes: 1 addition & 1 deletion include/RAJA/pattern/kernel/For.hpp
@@ -145,4 +145,4 @@ struct StatementExecutor<
} // end namespace RAJA


#endif /* RAJA_pattern_nested_HPP */
#endif /* RAJA_pattern_kernel_For_HPP */
2 changes: 1 addition & 1 deletion include/RAJA/pattern/kernel/internal/LoopData.hpp
@@ -214,7 +214,7 @@ struct GenericWrapper : GenericWrapperBase {


/*!
* Convenience object used to create thread-private a LoopData object.
* Convenience object used to create a thread-private LoopData object.
*/
template <typename T>
struct NestedPrivatizer {
5 changes: 2 additions & 3 deletions include/RAJA/pattern/kernel/internal/LoopTypes.hpp
@@ -3,8 +3,7 @@
*
* \file
*
* \brief Header file for loop kernel internals: LoopData structure and
* related helper functions.
* \brief Header file for loop kernel internals and related helper functions.
*
******************************************************************************
*/
@@ -93,4 +92,4 @@ using setSegmentTypeFromData =
} // end namespace RAJA


#endif /* RAJA_pattern_kernel_internal_LoopData_HPP */
#endif /* RAJA_pattern_kernel_internal_LoopTypes_HPP */
5 changes: 2 additions & 3 deletions include/RAJA/pattern/kernel/internal/Template.hpp
@@ -3,8 +3,7 @@
*
* \file
*
* \brief Header file for loop kernel internals: LoopData structure and
* related helper functions.
* \brief Header file for loop kernel internals and helper functions.
*
******************************************************************************
*/
@@ -83,4 +82,4 @@ using tuple_of_n = typename detail::TupleOfNHelper<T, camp::make_idx_seq_t<N>>::
} // end namespace RAJA


#endif /* RAJA_pattern_kernel_internal_LoopData_HPP */
#endif /* RAJA_pattern_kernel_internal_Template_HPP */
8 changes: 3 additions & 5 deletions include/RAJA/pattern/launch/launch_core.hpp
@@ -374,23 +374,21 @@ void launch(ExecPlace place, const LaunchParams &launch_params, ReduceParams&&..
}




// Helper function to retrieve a resource based on the run-time policy - if a device is active
#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || defined(RAJA_ENABLE_SYCL)
template<typename T, typename U>
RAJA::resources::Resource Get_Runtime_Resource(T host_res, U device_res, RAJA::ExecPlace device){
if(device == RAJA::ExecPlace::DEVICE) {return RAJA::resources::Resource(device_res);}
else { return RAJA::resources::Resource(host_res); }
}
#else
#endif

template<typename T>
RAJA::resources::Resource Get_Host_Resource(T host_res, RAJA::ExecPlace device){
if(device == RAJA::ExecPlace::DEVICE) {RAJA_ABORT_OR_THROW("Device is not enabled");}

return RAJA::resources::Resource(host_res);
}
#endif

//Launch API which takes team resource struct and supports new reducers
template <typename POLICY_LIST, typename ... ReduceParams>
(Diffs for the remaining changed files are not shown.)

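To make the run-time resource selection in launch_core.hpp concrete, here is a brief usage sketch of Get_Runtime_Resource and Get_Host_Resource as they behave after this change: with a GPU backend enabled, Get_Runtime_Resource type-erases either the host or the device resource based on the requested ExecPlace, while Get_Host_Resource, which is defined unconditionally after this change, aborts if a device run is requested in a host-only build. The CUDA backend and the trailing allocation are illustrative assumptions, not part of the commit.

// Sketch only: assumes a CUDA-enabled build for the device branch.
#include "RAJA/RAJA.hpp"

int main()
{
  RAJA::ExecPlace select_cpu_or_gpu = RAJA::ExecPlace::DEVICE;

  RAJA::resources::Host host_res;
#if defined(RAJA_ENABLE_CUDA)
  RAJA::resources::Cuda device_res;
  // Device-capable build: pick the device or host resource at run time
  RAJA::resources::Resource res =
      RAJA::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu);
#else
  // Host-only build: aborts/throws if ExecPlace::DEVICE was requested
  RAJA::resources::Resource res =
      RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu);
#endif

  // The type-erased resource can be handed to RAJA::launch or used directly
  int *ptr = res.allocate<int>(10);
  res.deallocate(ptr);

  return 0;
}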