Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 2 additions & 31 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ project(figr LANGUAGES C CXX Fortran)

option(FIGR_OpenACC "Build with OpenACC" OFF)
option(FIGR_OpenMP "Build with OpenMP" OFF)
option(FIGR_Unified "Build with unified CPU & GPU memory (GH-200 only)" OFF)
option(FIGR_PRE_PROCESS "Build pre_process" OFF)
option(FIGR_SIMULATION "Build simulation" OFF)
option(FIGR_POST_PROCESS "Build post_process" OFF)
Expand Down Expand Up @@ -263,12 +262,8 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release")

# Enable LTO/IPO if supported
if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC")
if (FIGR_Unified)
message(STATUS "LTO/IPO is not available with NVHPC using Unified Memory")
else()
message(STATUS "Performing IPO using -Mextract followed by -Minline")
set(NVHPC_USE_TWO_PASS_IPO TRUE)
endif()
message(STATUS "Performing IPO using -Mextract followed by -Minline")
set(NVHPC_USE_TWO_PASS_IPO TRUE)
else()
CHECK_IPO_SUPPORTED(RESULT SUPPORTS_IPO OUTPUT IPO_ERROR)
if (SUPPORTS_IPO)
Expand Down Expand Up @@ -571,39 +566,15 @@ function(FIGR_SETUP_TARGET)
)


# GH-200 Unified Memory Support
if (FIGR_Unified)
target_compile_options(${ARGS_TARGET}
PRIVATE -gpu=mem:unified:managedalloc -cuda
)
# "This option must appear in both the compile and link lines" -- NVHPC Docs
target_link_options(${ARGS_TARGET}
PRIVATE -gpu=mem:unified:managedalloc -cuda
)
endif()

if (CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelDebug")
target_compile_options(${a_target}
PRIVATE -gpu=debug
)
endif()
elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
# Frontier Unified Memory Support
if (FIGR_Unified)
target_compile_options(${ARGS_TARGET}
PRIVATE -DFRONTIER_UNIFIED)
endif()


find_package(hipfort COMPONENTS hip CONFIG REQUIRED)
target_link_libraries(${a_target} PRIVATE hipfort::hip hipfort::hipfort-amdgcn)
elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")

if (FIGR_Unified)
target_compile_options(${ARGS_TARGET}
PRIVATE -DFRONTIER_UNIFIED)
endif()

find_package(hipfort COMPONENTS hip CONFIG REQUIRED)
target_link_libraries(${a_target} PRIVATE hipfort::hip hipfort::hipfort-amdgcn flang_rt.hostdevice)
endif()
Expand Down
4 changes: 0 additions & 4 deletions examples/3D_IGR_TaylorGreenVortex_nvidia/case.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,6 @@
"fluid_pp(1)%gamma": 1.0e00 / (1.4 - 1),
"fluid_pp(1)%pi_inf": 0,
"fluid_pp(1)%Re(1)": 1 / mu,
# NVIDIA UVM Options
"nv_uvm_out_of_core": "T",
"nv_uvm_igr_temps_on_gpu": 3,
"nv_uvm_pref_gpu": "T",
}
)
)
43 changes: 1 addition & 42 deletions src/common/include/macros.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,48 +12,7 @@
#endif
#:enddef

! Caution: This macro requires the use of a binding script to set CUDA_VISIBLE_DEVICES, such that we have one GPU device per MPI
! rank. That's because for both cudaMemAdvise (preferred location) and cudaMemPrefetchAsync we use location = device_id = 0. For an
! example see misc/nvidia_uvm/bind.sh. The macro issues NVIDIA unified memory page placement hints for the arrays passed to it.
#:def PREFER_GPU(*args)
#! For each array in args, the expansion (1) advises the CUDA runtime that device 0 is the preferred location,
#! (2) marks the range as accessed by the CPU, and (3) prefetches the range to device 0 on stream 0.
#! The expansion is empty unless both FIGR_SIMULATION and __NVCOMPILER_GPU_UNIFIED_MEM are defined, and the
#! hints are issued only when nv_uvm_pref_gpu is true at run time.
#! A failing CUDA call only prints its status code; execution continues.
#ifdef FIGR_SIMULATION
#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
block
! NVIDIA CUDA Fortran 25.3+: uses submodules (cuda_runtime_api, gpu_reductions, sort) See
! https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/index.html#fortran-host-modules
#if __NVCOMPILER_MAJOR__ < 25 || (__NVCOMPILER_MAJOR__ == 25 && __NVCOMPILER_MINOR__ < 3)
use cudafor, gpu_sum => sum, gpu_maxval => maxval, gpu_minval => minval
#else
use cuda_runtime_api
#endif
! Status code returned by each CUDA runtime call below
integer :: istat

if (nv_uvm_pref_gpu) then
#:for arg in args
! print*, "Moving ${arg}$ to GPU => ", SHAPE(${arg}$)
! set preferred location GPU (device 0)
istat = cudaMemAdvise(c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetPreferredLocation, 0)
if (istat /= cudaSuccess) then
write (*, "('Error code: ',I0, ': ')") istat
! write(*,*) cudaGetErrorString(istat)
end if
! set accessed by CPU
istat = cudaMemAdvise(c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetAccessedBy, cudaCpuDeviceId)
if (istat /= cudaSuccess) then
write (*, "('Error code: ',I0, ': ')") istat
! write(*,*) cudaGetErrorString(istat)
end if
! prefetch to GPU - physically populate memory pages
istat = cudaMemPrefetchAsync(c_devloc(${arg}$), SIZEOF(${arg}$), 0, 0)
if (istat /= cudaSuccess) then
write (*, "('Error code: ',I0, ': ')") istat
! write(*,*) cudaGetErrorString(istat)
end if
#:endfor
end if
end block
#endif
#endif
#:enddef
! NOTE: no PREFER_GPU replacement (no-op or otherwise) is defined after this point; the UVM infrastructure was removed.

! Allocate and create GPU device memory
#:def ALLOCATE(*args)
Expand Down
13 changes: 0 additions & 13 deletions src/simulation/m_checker.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ contains
impure subroutine s_check_inputs

! Entry point for input validation: invokes the individual checker subroutines in the order listed.
call s_check_inputs_compilers
call s_check_inputs_nvidia_uvm
call s_check_inputs_time_stepping

end subroutine s_check_inputs
Expand All @@ -40,16 +39,4 @@ contains

end subroutine s_check_inputs_time_stepping

!> Validate NVIDIA unified virtual memory configuration parameters
impure subroutine s_check_inputs_nvidia_uvm

! The checks are compiled in only when __NVCOMPILER_GPU_UNIFIED_MEM is defined; otherwise this subroutine is empty.
! NOTE(review): PROHIBIT presumably aborts with the given message when its condition is true - confirm against its definition.
#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
! Reject values of nv_uvm_igr_temps_on_gpu outside [0, 3].
@:PROHIBIT(nv_uvm_igr_temps_on_gpu > 3 .or. nv_uvm_igr_temps_on_gpu < 0, &
& "nv_uvm_igr_temps_on_gpu must be in the range [0, 3]")
! Additionally reject the value 3 when igr_iter_solver == 2.
! NOTE(review): presumably because the third temporary (jac_old) is only used for igr_iter_solver == 1 - confirm in m_igr.
@:PROHIBIT(nv_uvm_igr_temps_on_gpu == 3 .and. igr_iter_solver == 2, &
& "nv_uvm_igr_temps_on_gpu must be in the range [0, 2] for igr_iter_solver == 2")
#endif

end subroutine s_check_inputs_nvidia_uvm

end module m_checker
19 changes: 0 additions & 19 deletions src/simulation/m_global_parameters.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,11 +88,6 @@ module m_global_parameters
logical :: viscous !< Viscous effects
#:endif

logical :: nv_uvm_out_of_core !< Enable out-of-core storage of q_cons_ts(2) in timestepping (default FALSE)
integer :: nv_uvm_igr_temps_on_gpu !< 0 => jac, jac_rhs, and jac_old on CPU
! 1 => jac on GPU, jac_rhs and jac_old on CPU
! 2 => jac and jac_rhs on GPU, jac_old on CPU
! 3 => jac, jac_rhs, and jac_old on GPU (default)
logical :: nv_uvm_pref_gpu !< Enable explicit gpu memory hints (default FALSE)
integer :: num_igr_iters !< number of iterations for elliptic solve
integer :: num_igr_warm_start_iters !< number of warm start iterations for elliptic solve
real(wp) :: alf_factor !< alpha factor for IGR
Expand Down Expand Up @@ -232,11 +227,6 @@ contains
t_stop = dflt_real
t_save = dflt_real

! NVIDIA UVM options
nv_uvm_out_of_core = .false.
nv_uvm_igr_temps_on_gpu = 3 ! => jac, jac_rhs, and jac_old on GPU (default)
nv_uvm_pref_gpu = .false.

! Simulation algorithm parameters
model_eqns = dflt_int
time_stepper = dflt_int
Expand Down Expand Up @@ -418,25 +408,16 @@ contains
@:ALLOCATE(x_cb(-1 - buff_size:m + buff_size))
@:ALLOCATE(x_cc(-buff_size:m + buff_size))
@:ALLOCATE(dx(-buff_size:m + buff_size))
@:PREFER_GPU(x_cb)
@:PREFER_GPU(x_cc)
@:PREFER_GPU(dx)

if (n == 0) return
@:ALLOCATE(y_cb(-1 - buff_size:n + buff_size))
@:ALLOCATE(y_cc(-buff_size:n + buff_size))
@:ALLOCATE(dy(-buff_size:n + buff_size))
@:PREFER_GPU(y_cb)
@:PREFER_GPU(y_cc)
@:PREFER_GPU(dy)

if (p == 0) return
@:ALLOCATE(z_cb(-1 - buff_size:p + buff_size))
@:ALLOCATE(z_cc(-buff_size:p + buff_size))
@:ALLOCATE(dz(-buff_size:p + buff_size))
@:PREFER_GPU(z_cb)
@:PREFER_GPU(z_cc)
@:PREFER_GPU(dz)

end subroutine s_initialize_global_parameters_module

Expand Down
70 changes: 0 additions & 70 deletions src/simulation/m_igr.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,9 @@ module m_igr
private; public :: s_initialize_igr_module, s_igr_iterative_solve, s_igr_riemann_solver, s_igr_sigma_x, s_igr_flux_add, &
& s_finalize_igr_module

#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
integer, dimension(3) :: nv_uvm_temp_on_gpu
real(wp), pointer, contiguous, dimension(:,:,:) :: jac, jac_rhs, jac_old
real(wp), allocatable, dimension(:,:,:), pinned, target :: jac_host
real(wp), allocatable, dimension(:,:,:), pinned, target :: jac_rhs_host
real(wp), allocatable, dimension(:,:,:), pinned, target :: jac_old_host
#else
real(wp), allocatable, target, dimension(:,:,:) :: jac
real(wp), allocatable, dimension(:,:,:) :: jac_rhs, jac_old
$:GPU_DECLARE(create='[jac, jac_rhs, jac_old]')
#endif
type(scalar_field), dimension(1) :: jac_sf
$:GPU_DECLARE(create='[jac_sf]')

Expand Down Expand Up @@ -87,51 +79,14 @@ contains
end do
end do
$:GPU_UPDATE(device='[Res_igr, Re_idx, Re_size]')
@:PREFER_GPU(Res_igr)
@:PREFER_GPU(Re_idx)
end if

#ifndef __NVCOMPILER_GPU_UNIFIED_MEM
@:ALLOCATE(jac(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, idwbuff(3)%beg:idwbuff(3)%end))
@:ALLOCATE(jac_rhs(-1:m,-1:n,-1:p))

if (igr_iter_solver == 1) then ! Jacobi iteration
@:ALLOCATE(jac_old(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, idwbuff(3)%beg:idwbuff(3)%end))
end if
#else
! create map
nv_uvm_temp_on_gpu(1:3) = 0
nv_uvm_temp_on_gpu(1:nv_uvm_igr_temps_on_gpu) = 1

if (nv_uvm_temp_on_gpu(1) == 1) then
@:ALLOCATE(jac(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, idwbuff(3)%beg:idwbuff(3)%end))
@:PREFER_GPU(jac)
else
allocate (jac_host(idwbuff(1)%beg:idwbuff(1)%end,idwbuff(2)%beg:idwbuff(2)%end,idwbuff(3)%beg:idwbuff(3)%end))

jac(idwbuff(1)%beg:idwbuff(1)%end,idwbuff(2)%beg:idwbuff(2)%end,idwbuff(3)%beg:idwbuff(3)%end) => jac_host(:,:,:)
end if

if (nv_uvm_temp_on_gpu(2) == 1) then
@:ALLOCATE(jac_rhs(-1:m,-1:n,-1:p))
@:PREFER_GPU(jac_rhs)
else
allocate (jac_rhs_host(-1:m,-1:n,-1:p))
jac_rhs(-1:m,-1:n,-1:p) => jac_rhs_host(:,:,:)
end if

if (igr_iter_solver == 1) then ! Jacobi iteration
if (nv_uvm_temp_on_gpu(3) == 1) then
@:ALLOCATE(jac_old(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, idwbuff(3)%beg:idwbuff(3)%end))
@:PREFER_GPU(jac_old)
else
allocate (jac_old_host(idwbuff(1)%beg:idwbuff(1)%end,idwbuff(2)%beg:idwbuff(2)%end,idwbuff(3)%beg:idwbuff(3)%end))

jac_old(idwbuff(1)%beg:idwbuff(1)%end,idwbuff(2)%beg:idwbuff(2)%end, &
& idwbuff(3)%beg:idwbuff(3)%end) => jac_old_host(:,:,:)
end if
end if
#endif

$:GPU_PARALLEL_LOOP(private='[j, k, l]', collapse=3)
do l = idwbuff(3)%beg, idwbuff(3)%end
Expand Down Expand Up @@ -2704,36 +2659,11 @@ contains
@:DEALLOCATE(Res_igr)
end if

#ifndef __NVCOMPILER_GPU_UNIFIED_MEM
@:DEALLOCATE(jac, jac_rhs)

if (igr_iter_solver == 1) then ! Jacobi iteration
@:DEALLOCATE(jac_old)
end if
#else
if (nv_uvm_temp_on_gpu(1) == 1) then
@:DEALLOCATE(jac)
else
nullify (jac)
deallocate (jac_host)
end if

if (nv_uvm_temp_on_gpu(2) == 1) then
@:DEALLOCATE(jac_rhs)
else
nullify (jac_rhs)
deallocate (jac_rhs_host)
end if

if (igr_iter_solver == 1) then ! Jacobi iteration
if (nv_uvm_temp_on_gpu(3) == 1) then
@:DEALLOCATE(jac_old)
else
nullify (jac_old)
deallocate (jac_old_host)
end if
end if
#endif

#:if not FIGR_CASE_OPTIMIZATION
@:DEALLOCATE(coeff_L, coeff_R)
Expand Down
5 changes: 0 additions & 5 deletions src/simulation/m_mpi_proxy.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,11 +88,6 @@ contains
#:endfor
end do

! NVIDIA UVM variables
call MPI_BCAST(nv_uvm_out_of_core, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
call MPI_BCAST(nv_uvm_igr_temps_on_gpu, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
call MPI_BCAST(nv_uvm_pref_gpu, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)

end subroutine s_mpi_bcast_user_inputs

!> Broadcast random phase numbers from rank 0 to all MPI processes
Expand Down
4 changes: 1 addition & 3 deletions src/simulation/m_start_up.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ contains
igr_iter_solver, igr_pres_lim, &
#:endif
file_per_process, n_start, t_save, t_stop, cfl_adap_dt, cfl_const_dt, cfl_target, num_bc_patches, alf_factor, &
& num_igr_iters, num_igr_warm_start_iters, nv_uvm_out_of_core, nv_uvm_igr_temps_on_gpu, nv_uvm_pref_gpu, down_sample, &
& num_igr_iters, num_igr_warm_start_iters, down_sample, &
& double_mach

inquire (FILE=trim(file_path), EXIST=file_exist)
Expand Down Expand Up @@ -575,9 +575,7 @@ contains
call cpu_time(start)
call nvtxStartRange("SAVE-DATA")
do i = 1, sys_size
#ifndef FRONTIER_UNIFIED
$:GPU_UPDATE(host='[q_cons_ts(stor)%vf(i)%sf]')
#endif
do l = 0, p
do k = 0, n
do j = 0, m
Expand Down
Loading
Loading