Merge branch 'develop' into woptim/build-cache
adrienbernede committed Jun 6, 2024
2 parents 5cc75c4 + dfaab80 commit bc99666
Showing 24 changed files with 363 additions and 276 deletions.
1 change: 1 addition & 0 deletions .gitlab-ci.yml
@@ -100,6 +100,7 @@ trigger-rajaperf:
strategy: depend

include:
# Sets ID tokens for every job using `default:`
- project: 'lc-templates/id_tokens'
file: 'id_tokens.yml'
# [Optional] checks preliminary to running the actual CI test
8 changes: 4 additions & 4 deletions .gitlab/jobs/tioga.yml
@@ -26,12 +26,12 @@
# ${PROJECT_<MACHINE>_DEPS} in the extra jobs. There is no reason not to fully
# describe the spec here.

rocmcc_5_7_1_hip_desul_atomics:
rocmcc_6_1_1_hip_desul_atomics:
variables:
SPEC: "~shared +rocm ~openmp +desul +tests amdgpu_target=gfx90a %rocmcc@=5.7.1 ^hip@5.7.1 ^blt@develop"
SPEC: "~shared +rocm ~openmp +desul +tests amdgpu_target=gfx90a %rocmcc@=6.1.1 ^hip@6.1.1 ^blt@develop"
extends: .job_on_tioga

rocmcc_5_7_1_hip_openmp:
rocmcc_6_1_1_hip_openmp:
variables:
SPEC: "~shared +rocm +openmp +omptask +tests amdgpu_target=gfx90a %rocmcc@=5.7.1 ^hip@5.7.1 ^blt@develop"
SPEC: "~shared +rocm +openmp +omptask +tests amdgpu_target=gfx90a %rocmcc@=6.1.1 ^hip@6.1.1 ^blt@develop"
extends: .job_on_tioga
2 changes: 1 addition & 1 deletion .uberenv_config.json
@@ -4,7 +4,7 @@
"package_final_phase" : "initconfig",
"package_source_dir" : "../..",
"spack_url": "https://github.com/spack/spack.git",
"spack_branch": "bugfix/invalid-compiler-warning",
"spack_branch": "develop-2024-05-26",
"spack_activate" : {},
"spack_configs_path": "scripts/radiuss-spack-configs",
"spack_packages_path": "scripts/radiuss-spack-configs/packages",
91 changes: 51 additions & 40 deletions examples/dynamic_mat_transpose.cpp
@@ -11,7 +11,6 @@
#include <iostream>

#include "RAJA/RAJA.hpp"
#include "memoryManager.hpp"

/*
* Matrix Transpose Example
@@ -96,7 +95,7 @@ using outer0 = RAJA::LoopPolicy<
#endif
#if defined(RAJA_ENABLE_SYCL)
,
RAJA::sycl_group_0_direct
RAJA::sycl_group_2_direct
#endif
>;

@@ -135,7 +134,7 @@ using inner0 = RAJA::LoopPolicy<
#endif
#if defined(RAJA_ENABLE_SYCL)
,
RAJA::sycl_local_0_direct
RAJA::sycl_local_2_direct
#endif
>;

@@ -154,20 +153,9 @@ using inner1 = RAJA::LoopPolicy<RAJA::seq_exec
#endif
>;

template<typename T>
void switch_ptrs(T *A, T *d_A)
{
T *tmp_ptr;
tmp_ptr = d_A;
d_A = A;
A = tmp_ptr;
}

int main(int argc, char *argv[])
{

std::cout << "\n\nRAJA matrix transpose example...\n";

if(argc != 2) {
RAJA_ABORT_OR_THROW("Usage ./dynamic_mat_transpose host or ./dynamic_mat_transpose device");
}
@@ -185,17 +173,26 @@ int main(int argc, char *argv[])

RAJA::ExecPlace select_cpu_or_gpu;
if(exec_space.compare("host") == 0)
{ select_cpu_or_gpu = RAJA::ExecPlace::HOST; printf("Running RAJA::launch reductions example on the host \n"); }
{ select_cpu_or_gpu = RAJA::ExecPlace::HOST; std::cout<<"Running RAJA::launch matrix transpose example on the host"<<std::endl; }
if(exec_space.compare("device") == 0)
{ select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; printf("Running RAJA::launch reductions example on the device \n"); }


{ select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; std::cout<<"Running RAJA::launch matrix transpose example on the device" <<std::endl; }

RAJA::resources::Host host_res;
#if defined(RAJA_ENABLE_CUDA)
RAJA::resources::Cuda device_res;
#endif
#if defined(RAJA_ENABLE_HIP)
RAJA::resources::Hip device_res;
#endif
#if defined(RAJA_ENABLE_SYCL)
memoryManager::sycl_res = new camp::resources::Resource{camp::resources::Sycl()};
::RAJA::sycl::detail::setQueue(memoryManager::sycl_res);
RAJA::resources::Sycl device_res;
#endif

#if defined(RAJA_GPU_ACTIVE)
RAJA::resources::Resource res = RAJA::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu);
#else
RAJA::resources::Resource res = RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu);
#endif
//
// Define num rows/cols in matrix, tile dimensions, and number of tiles
//
@@ -212,9 +209,8 @@ int main(int argc, char *argv[])
//
// Allocate matrix data
//
int *A = memoryManager::allocate<int>(N_r * N_c);
int *At = memoryManager::allocate<int>(N_r * N_c);

int *A = host_res.allocate<int>(N_r * N_c);
int *At = host_res.allocate<int>(N_r * N_c);
//
// In the following implementations of matrix transpose, we
// use RAJA 'View' objects to access the matrix data. A RAJA view
@@ -300,20 +296,24 @@ int main(int argc, char *argv[])

std::cout << "\n Running RAJA matrix transpose w/ dynamic shared memory ...\n";

#if defined(RAJA_ENABLE_HIP)
//Reset memory
std::memset(At, 0, N_r * N_c * sizeof(int));

//Hip requires device side pointers
#if defined(RAJA_GPU_ACTIVE)
//Allocate device side pointers
int *d_A = nullptr, *d_At = nullptr;

if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) {
d_A = memoryManager::allocate_gpu<int>(N_r * N_c);
d_At = memoryManager::allocate_gpu<int>(N_r * N_c);

hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice ));
d_A = device_res.allocate<int>(N_r * N_c);
d_At = device_res.allocate<int>(N_r * N_c);

device_res.memcpy(d_A, A, sizeof(int) * N_r * N_c);
device_res.memcpy(d_At, At, sizeof(int) * N_r * N_c);

//switch host/device pointers so we can reuse the views
switch_ptrs(d_A, A);
switch_ptrs(d_At, At);
Aview.set_data(d_A);
Atview.set_data(d_At);
}
#endif

@@ -323,13 +323,11 @@

// _dynamic_mattranspose_kernel_start
RAJA::launch<launch_policy>
(select_cpu_or_gpu,
RAJA::LaunchParams(RAJA::Teams(outer_Dimr, outer_Dimc),
RAJA::Threads(TILE_DIM, TILE_DIM), dynamic_shared_mem_size),
(res, RAJA::LaunchParams(RAJA::Teams(outer_Dimc, outer_Dimr),
RAJA::Threads(TILE_DIM, TILE_DIM), dynamic_shared_mem_size),
"Matrix tranpose with dynamic shared memory kernel",
[=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
{

RAJA::loop<outer1>(ctx, RAJA::RangeSegment(0, outer_Dimr), [&] (int by){
RAJA::loop<outer0>(ctx, RAJA::RangeSegment(0, outer_Dimc), [&] (int bx){

@@ -378,24 +376,37 @@ int main(int argc, char *argv[])
ctx.releaseSharedMemory();
});
});

});
// _dynamic_mattranspose_kernel_end


#if defined(RAJA_ENABLE_HIP)
#if defined(RAJA_GPU_ACTIVE)
if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) {
switch_ptrs(d_At, At);
switch_ptrs(d_A, A);

hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost ));
device_res.memcpy(A, d_A, sizeof(int) * N_r * N_c);
device_res.memcpy(At, d_At, sizeof(int) * N_r * N_c);

Aview.set_data(A);
Atview.set_data(At);
}
#endif


checkResult<int>(Atview, N_c, N_r);
//printResult<int>(Atview, N_c, N_r);
//----------------------------------------------------------------------------//

//Release data
host_res.deallocate(A);
host_res.deallocate(At);

#if defined(RAJA_GPU_ACTIVE)
if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) {
device_res.deallocate(d_A);
device_res.deallocate(d_At);
}
#endif


return 0;
}
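
A minimal, self-contained sketch of the resource-based memory pattern the updated example adopts above: host and device allocation through RAJA resource objects, device copies via Resource::memcpy, and retargeting a RAJA::View with set_data. This sketch is not taken from the commit; the CUDA backend choice, the size N, and the variable names are illustrative assumptions (HIP and SYCL resources expose the same allocate/memcpy/deallocate interface).

// Sketch only: assumes a CUDA-enabled RAJA build.
#include <cstring>
#include "RAJA/RAJA.hpp"

int main()
{
  constexpr int N = 16;

  RAJA::resources::Host host_res;
  RAJA::resources::Cuda device_res;  // assumption: CUDA backend enabled

  // Host-side allocation through the resource (replaces memoryManager)
  int *A = host_res.allocate<int>(N * N);
  std::memset(A, 0, N * N * sizeof(int));

  // The view initially wraps host data
  RAJA::View<int, RAJA::Layout<2>> Aview(A, N, N);

  // Device-side allocation and copy, then point the view at device memory
  int *d_A = device_res.allocate<int>(N * N);
  device_res.memcpy(d_A, A, sizeof(int) * N * N);
  Aview.set_data(d_A);

  // ... launch kernels that read and write through Aview ...

  // Copy results back and restore the host pointer in the view
  device_res.memcpy(A, d_A, sizeof(int) * N * N);
  Aview.set_data(A);

  device_res.deallocate(d_A);
  host_res.deallocate(A);

  return 0;
}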

5 changes: 4 additions & 1 deletion examples/resource-dynamic-forall.cpp
@@ -110,9 +110,12 @@ int main(int argc, char *argv[])
#if defined(RAJA_ENABLE_HIP)
RAJA::resources::Hip device_res;
#endif
#if defined(RAJA_ENABLE_SYCL)
RAJA::resources::Sycl device_res;
#endif

//Get typed erased resource - it will internally store if we are running on the host or device
#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || defined(RAJA_ENABLE_SYCL)
RAJA::resources::Resource res = RAJA::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu);
#else
RAJA::resources::Resource res = RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu);
2 changes: 1 addition & 1 deletion examples/resource-runtime-launch.cpp
@@ -153,7 +153,7 @@ int main(int argc, char *argv[])
#endif

//Get typed erased resource - it will internally store if we are running on the host or device
#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
#if defined(RAJA_GPU_ACTIVE) && !defined(RAJA_ENABLE_SYCL)
RAJA::resources::Resource res = RAJA::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu);
#else
RAJA::resources::Resource res = RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu);
2 changes: 1 addition & 1 deletion include/RAJA/pattern/kernel/For.hpp
@@ -145,4 +145,4 @@ struct StatementExecutor<
} // end namespace RAJA


#endif /* RAJA_pattern_nested_HPP */
#endif /* RAJA_pattern_kernel_For_HPP */
2 changes: 1 addition & 1 deletion include/RAJA/pattern/kernel/internal/LoopData.hpp
@@ -214,7 +214,7 @@ struct GenericWrapper : GenericWrapperBase {


/*!
* Convenience object used to create thread-private a LoopData object.
* Convenience object used to create a thread-private LoopData object.
*/
template <typename T>
struct NestedPrivatizer {
5 changes: 2 additions & 3 deletions include/RAJA/pattern/kernel/internal/LoopTypes.hpp
@@ -3,8 +3,7 @@
*
* \file
*
* \brief Header file for loop kernel internals: LoopData structure and
* related helper functions.
* \brief Header file for loop kernel internals and related helper functions.
*
******************************************************************************
*/
@@ -93,4 +92,4 @@ using setSegmentTypeFromData =
} // end namespace RAJA


#endif /* RAJA_pattern_kernel_internal_LoopData_HPP */
#endif /* RAJA_pattern_kernel_internal_LoopTypes_HPP */
5 changes: 2 additions & 3 deletions include/RAJA/pattern/kernel/internal/Template.hpp
@@ -3,8 +3,7 @@
*
* \file
*
* \brief Header file for loop kernel internals: LoopData structure and
* related helper functions.
* \brief Header file for loop kernel internals and helper functions.
*
******************************************************************************
*/
@@ -83,4 +82,4 @@ using tuple_of_n = typename detail::TupleOfNHelper<T, camp::make_idx_seq_t<N>>::
} // end namespace RAJA


#endif /* RAJA_pattern_kernel_internal_LoopData_HPP */
#endif /* RAJA_pattern_kernel_internal_Template_HPP */
8 changes: 3 additions & 5 deletions include/RAJA/pattern/launch/launch_core.hpp
@@ -374,23 +374,21 @@ void launch(ExecPlace place, const LaunchParams &launch_params, ReduceParams&&..
}




// Helper function to retrieve a resource based on the run-time policy - if a device is active
#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || defined(RAJA_ENABLE_SYCL)
template<typename T, typename U>
RAJA::resources::Resource Get_Runtime_Resource(T host_res, U device_res, RAJA::ExecPlace device){
if(device == RAJA::ExecPlace::DEVICE) {return RAJA::resources::Resource(device_res);}
else { return RAJA::resources::Resource(host_res); }
}
#else
#endif

template<typename T>
RAJA::resources::Resource Get_Host_Resource(T host_res, RAJA::ExecPlace device){
if(device == RAJA::ExecPlace::DEVICE) {RAJA_ABORT_OR_THROW("Device is not enabled");}

return RAJA::resources::Resource(host_res);
}
#endif

//Launch API which takes team resource struct and supports new reducers
template <typename POLICY_LIST, typename ... ReduceParams>
(Diffs for the remaining changed files are not shown.)

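To make the run-time resource selection in launch_core.hpp concrete, here is a brief usage sketch of Get_Runtime_Resource and Get_Host_Resource as they behave after this change: with a GPU backend enabled, Get_Runtime_Resource type-erases either the host or the device resource based on the requested ExecPlace, while Get_Host_Resource, which is defined unconditionally after this change, aborts if a device run is requested in a host-only build. The CUDA backend and the trailing allocation are illustrative assumptions, not part of the commit.

// Sketch only: assumes a CUDA-enabled build for the device branch.
#include "RAJA/RAJA.hpp"

int main()
{
  RAJA::ExecPlace select_cpu_or_gpu = RAJA::ExecPlace::DEVICE;

  RAJA::resources::Host host_res;
#if defined(RAJA_ENABLE_CUDA)
  RAJA::resources::Cuda device_res;
  // Device-capable build: pick the device or host resource at run time
  RAJA::resources::Resource res =
      RAJA::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu);
#else
  // Host-only build: aborts/throws if ExecPlace::DEVICE was requested
  RAJA::resources::Resource res =
      RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu);
#endif

  // The type-erased resource can be handed to RAJA::launch or used directly
  int *ptr = res.allocate<int>(10);
  res.deallocate(ptr);

  return 0;
}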