sbryngelson · sbryngelson · Apr 4, 2026 · Apr 4, 2026 · Apr 4, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -19,7 +19,6 @@ project(figr LANGUAGES C CXX Fortran)
 
 option(FIGR_OpenACC       "Build with OpenACC"                                OFF)
 option(FIGR_OpenMP        "Build with OpenMP"                                 OFF)
-option(FIGR_Unified       "Build with unified CPU & GPU memory (GH-200 only)" OFF)
 option(FIGR_PRE_PROCESS   "Build pre_process"                                 OFF)
 option(FIGR_SIMULATION    "Build simulation"                                  OFF)
 option(FIGR_POST_PROCESS  "Build post_process"                                OFF)
@@ -263,12 +262,8 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release")
 
     # Enable LTO/IPO if supported
     if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC")
-        if (FIGR_Unified)
-            message(STATUS "LTO/IPO is not available with NVHPC using Unified Memory")
-        else()
-            message(STATUS "Performing IPO using -Mextract followed by -Minline")
-            set(NVHPC_USE_TWO_PASS_IPO TRUE)
-        endif()
+        message(STATUS "Performing IPO using -Mextract followed by -Minline")
+        set(NVHPC_USE_TWO_PASS_IPO TRUE)
     else()
         CHECK_IPO_SUPPORTED(RESULT SUPPORTS_IPO OUTPUT IPO_ERROR)
         if (SUPPORTS_IPO)
@@ -571,39 +566,15 @@ function(FIGR_SETUP_TARGET)
                 )
 
 
-                # GH-200 Unified Memory Support
-                if (FIGR_Unified)
-                    target_compile_options(${ARGS_TARGET}
-                        PRIVATE -gpu=mem:unified:managedalloc -cuda
-                    )
-                    # "This option must appear in both the compile and link lines" -- NVHPC Docs
-                    target_link_options(${ARGS_TARGET}
-                        PRIVATE -gpu=mem:unified:managedalloc -cuda
-                    )
-                endif()
-
                 if (CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelDebug")
                     target_compile_options(${a_target}
                         PRIVATE -gpu=debug
                     )
                 endif()
             elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
-                # Frontier Unified Memory Support
-                if (FIGR_Unified)
-                    target_compile_options(${ARGS_TARGET}
-                        PRIVATE -DFRONTIER_UNIFIED)
-                endif()
-
-
                 find_package(hipfort COMPONENTS hip CONFIG REQUIRED)
                 target_link_libraries(${a_target} PRIVATE hipfort::hip hipfort::hipfort-amdgcn)
             elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
-
-                if (FIGR_Unified)
-                    target_compile_options(${ARGS_TARGET}
-                        PRIVATE -DFRONTIER_UNIFIED)
-                endif()
-
                 find_package(hipfort COMPONENTS hip CONFIG REQUIRED)
                 target_link_libraries(${a_target} PRIVATE hipfort::hip hipfort::hipfort-amdgcn flang_rt.hostdevice)
             endif()

diff --git a/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py b/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py
@@ -94,10 +94,6 @@
             "fluid_pp(1)%gamma": 1.0e00 / (1.4 - 1),
             "fluid_pp(1)%pi_inf": 0,
             "fluid_pp(1)%Re(1)": 1 / mu,
-            # NVIDIA UVM Options
-            "nv_uvm_out_of_core": "T",
-            "nv_uvm_igr_temps_on_gpu": 3,
-            "nv_uvm_pref_gpu": "T",
         }
     )
 )
diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp
@@ -12,48 +12,7 @@
 #endif
 #:enddef
 
-! Caution: This macro requires the use of a binding script to set CUDA_VISIBLE_DEVICES, such that we have one GPU device per MPI
-! rank. That's because for both cudaMemAdvise (preferred location) and cudaMemPrefetchAsync we use location = device_id = 0. For an
-! example see misc/nvidia_uvm/bind.sh. NVIDIA unified memory page placement hint
-#:def PREFER_GPU(*args)
-#ifdef FIGR_SIMULATION
-#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
-    block
-        ! NVIDIA CUDA Fortran 25.3+: uses submodules (cuda_runtime_api, gpu_reductions, sort) See
-        ! https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/index.html#fortran-host-modules
-#if __NVCOMPILER_MAJOR__ < 25 || (__NVCOMPILER_MAJOR__ == 25 && __NVCOMPILER_MINOR__ < 3)
-        use cudafor, gpu_sum => sum, gpu_maxval => maxval, gpu_minval => minval
-#else
-        use cuda_runtime_api
-#endif
-        integer :: istat
-
-        if (nv_uvm_pref_gpu) then
-            #:for arg in args
-                ! print*, "Moving ${arg}$ to GPU => ", SHAPE(${arg}$) set preferred location GPU
-                istat = cudaMemAdvise(c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetPreferredLocation, 0)
-                if (istat /= cudaSuccess) then
-                    write (*, "('Error code: ',I0, ': ')") istat
-                    ! write(*,*) cudaGetErrorString(istat)
-                end if
-                ! set accessed by CPU
-                istat = cudaMemAdvise(c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetAccessedBy, cudaCpuDeviceId)
-                if (istat /= cudaSuccess) then
-                    write (*, "('Error code: ',I0, ': ')") istat
-                    ! write(*,*) cudaGetErrorString(istat)
-                end if
-                ! prefetch to GPU - physically populate memory pages
-                istat = cudaMemPrefetchAsync(c_devloc(${arg}$), SIZEOF(${arg}$), 0, 0)
-                if (istat /= cudaSuccess) then
-                    write (*, "('Error code: ',I0, ': ')") istat
-                    ! write(*,*) cudaGetErrorString(istat)
-                end if
-            #:endfor
-        end if
-    end block
-#endif
-#endif
-#:enddef
+! NOTE: the PREFER_GPU macro was removed together with the NVIDIA UVM infrastructure; no replacement macro is defined here.
 
 ! Allocate and create GPU device memory
 #:def ALLOCATE(*args)

diff --git a/src/simulation/m_checker.fpp b/src/simulation/m_checker.fpp
@@ -21,7 +21,6 @@ contains
     impure subroutine s_check_inputs
 
         call s_check_inputs_compilers
-        call s_check_inputs_nvidia_uvm
         call s_check_inputs_time_stepping
 
     end subroutine s_check_inputs
@@ -40,16 +39,4 @@ contains
 
     end subroutine s_check_inputs_time_stepping
 
-    !> Validate NVIDIA unified virtual memory configuration parameters
-    impure subroutine s_check_inputs_nvidia_uvm
-
-#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
-        @:PROHIBIT(nv_uvm_igr_temps_on_gpu > 3 .or. nv_uvm_igr_temps_on_gpu < 0, &
-                   & "nv_uvm_igr_temps_on_gpu must be in the range [0, 3]")
-        @:PROHIBIT(nv_uvm_igr_temps_on_gpu == 3 .and. igr_iter_solver == 2, &
-                   & "nv_uvm_igr_temps_on_gpu must be in the range [0, 2] for igr_iter_solver == 2")
-#endif
-
-    end subroutine s_check_inputs_nvidia_uvm
-
 end module m_checker
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
@@ -88,11 +88,6 @@ module m_global_parameters
         logical  :: viscous            !< Viscous effects
     #:endif
 
-    logical :: nv_uvm_out_of_core       !< Enable out-of-core storage of q_cons_ts(2) in timestepping (default FALSE)
-    integer :: nv_uvm_igr_temps_on_gpu  !< 0 => jac, jac_rhs, and jac_old on CPU
-    ! 1 => jac on GPU, jac_rhs and jac_old on CPU 2 => jac and jac_rhs on GPU, jac_old on CPU 3 => jac, jac_rhs, and jac_old on GPU
-    ! (default)
-    logical :: nv_uvm_pref_gpu  !< Enable explicit gpu memory hints (default FALSE)
     integer  :: num_igr_iters             !< number of iterations for elliptic solve
     integer  :: num_igr_warm_start_iters  !< number of warm start iterations for elliptic solve
     real(wp) :: alf_factor                !< alpha factor for IGR
@@ -232,11 +227,6 @@ contains
         t_stop = dflt_real
         t_save = dflt_real
 
-        ! NVIDIA UVM options
-        nv_uvm_out_of_core = .false.
-        nv_uvm_igr_temps_on_gpu = 3  ! => jac, jac_rhs, and jac_old on GPU (default)
-        nv_uvm_pref_gpu = .false.
-
         ! Simulation algorithm parameters
         model_eqns = dflt_int
         time_stepper = dflt_int
@@ -418,25 +408,16 @@ contains
         @:ALLOCATE(x_cb(-1 - buff_size:m + buff_size))
         @:ALLOCATE(x_cc(-buff_size:m + buff_size))
         @:ALLOCATE(dx(-buff_size:m + buff_size))
-        @:PREFER_GPU(x_cb)
-        @:PREFER_GPU(x_cc)
-        @:PREFER_GPU(dx)
 
         if (n == 0) return
         @:ALLOCATE(y_cb(-1 - buff_size:n + buff_size))
         @:ALLOCATE(y_cc(-buff_size:n + buff_size))
         @:ALLOCATE(dy(-buff_size:n + buff_size))
-        @:PREFER_GPU(y_cb)
-        @:PREFER_GPU(y_cc)
-        @:PREFER_GPU(dy)
 
         if (p == 0) return
         @:ALLOCATE(z_cb(-1 - buff_size:p + buff_size))
         @:ALLOCATE(z_cc(-buff_size:p + buff_size))
         @:ALLOCATE(dz(-buff_size:p + buff_size))
-        @:PREFER_GPU(z_cb)
-        @:PREFER_GPU(z_cc)
-        @:PREFER_GPU(dz)
 
     end subroutine s_initialize_global_parameters_module
 

diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp
@@ -16,17 +16,9 @@ module m_igr
     private; public :: s_initialize_igr_module, s_igr_iterative_solve, s_igr_riemann_solver, s_igr_sigma_x, s_igr_flux_add, &
         & s_finalize_igr_module
 
-#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
-    integer, dimension(3)                                   :: nv_uvm_temp_on_gpu
-    real(wp), pointer, contiguous, dimension(:,:,:)         :: jac, jac_rhs, jac_old
-    real(wp), allocatable, dimension(:,:,:), pinned, target :: jac_host
-    real(wp), allocatable, dimension(:,:,:), pinned, target :: jac_rhs_host
-    real(wp), allocatable, dimension(:,:,:), pinned, target :: jac_old_host
-#else
     real(wp), allocatable, target, dimension(:,:,:) :: jac
     real(wp), allocatable, dimension(:,:,:)         :: jac_rhs, jac_old
     $:GPU_DECLARE(create='[jac, jac_rhs, jac_old]')
-#endif
     type(scalar_field), dimension(1) :: jac_sf
     $:GPU_DECLARE(create='[jac_sf]')
 
@@ -87,51 +79,14 @@ contains
                 end do
             end do
             $:GPU_UPDATE(device='[Res_igr, Re_idx, Re_size]')
-            @:PREFER_GPU(Res_igr)
-            @:PREFER_GPU(Re_idx)
         end if
 
-#ifndef __NVCOMPILER_GPU_UNIFIED_MEM
         @:ALLOCATE(jac(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, idwbuff(3)%beg:idwbuff(3)%end))
         @:ALLOCATE(jac_rhs(-1:m,-1:n,-1:p))
 
         if (igr_iter_solver == 1) then  ! Jacobi iteration
             @:ALLOCATE(jac_old(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, idwbuff(3)%beg:idwbuff(3)%end))
         end if
-#else
-        ! create map
-        nv_uvm_temp_on_gpu(1:3) = 0
-        nv_uvm_temp_on_gpu(1:nv_uvm_igr_temps_on_gpu) = 1
-
-        if (nv_uvm_temp_on_gpu(1) == 1) then
-            @:ALLOCATE(jac(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, idwbuff(3)%beg:idwbuff(3)%end))
-            @:PREFER_GPU(jac)
-        else
-            allocate (jac_host(idwbuff(1)%beg:idwbuff(1)%end,idwbuff(2)%beg:idwbuff(2)%end,idwbuff(3)%beg:idwbuff(3)%end))
-
-            jac(idwbuff(1)%beg:idwbuff(1)%end,idwbuff(2)%beg:idwbuff(2)%end,idwbuff(3)%beg:idwbuff(3)%end) => jac_host(:,:,:)
-        end if
-
-        if (nv_uvm_temp_on_gpu(2) == 1) then
-            @:ALLOCATE(jac_rhs(-1:m,-1:n,-1:p))
-            @:PREFER_GPU(jac_rhs)
-        else
-            allocate (jac_rhs_host(-1:m,-1:n,-1:p))
-            jac_rhs(-1:m,-1:n,-1:p) => jac_rhs_host(:,:,:)
-        end if
-
-        if (igr_iter_solver == 1) then  ! Jacobi iteration
-            if (nv_uvm_temp_on_gpu(3) == 1) then
-                @:ALLOCATE(jac_old(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, idwbuff(3)%beg:idwbuff(3)%end))
-                @:PREFER_GPU(jac_old)
-            else
-                allocate (jac_old_host(idwbuff(1)%beg:idwbuff(1)%end,idwbuff(2)%beg:idwbuff(2)%end,idwbuff(3)%beg:idwbuff(3)%end))
-
-                jac_old(idwbuff(1)%beg:idwbuff(1)%end,idwbuff(2)%beg:idwbuff(2)%end, &
-                        & idwbuff(3)%beg:idwbuff(3)%end) => jac_old_host(:,:,:)
-            end if
-        end if
-#endif
 
         $:GPU_PARALLEL_LOOP(private='[j, k, l]', collapse=3)
         do l = idwbuff(3)%beg, idwbuff(3)%end
@@ -2704,36 +2659,11 @@ contains
             @:DEALLOCATE(Res_igr)
         end if
 
-#ifndef __NVCOMPILER_GPU_UNIFIED_MEM
         @:DEALLOCATE(jac, jac_rhs)
 
         if (igr_iter_solver == 1) then  ! Jacobi iteration
             @:DEALLOCATE(jac_old)
         end if
-#else
-        if (nv_uvm_temp_on_gpu(1) == 1) then
-            @:DEALLOCATE(jac)
-        else
-            nullify (jac)
-            deallocate (jac_host)
-        end if
-
-        if (nv_uvm_temp_on_gpu(2) == 1) then
-            @:DEALLOCATE(jac_rhs)
-        else
-            nullify (jac_rhs)
-            deallocate (jac_rhs_host)
-        end if
-
-        if (igr_iter_solver == 1) then  ! Jacobi iteration
-            if (nv_uvm_temp_on_gpu(3) == 1) then
-                @:DEALLOCATE(jac_old)
-            else
-                nullify (jac_old)
-                deallocate (jac_old_host)
-            end if
-        end if
-#endif
 
         #:if not FIGR_CASE_OPTIMIZATION
             @:DEALLOCATE(coeff_L, coeff_R)

diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp
@@ -88,11 +88,6 @@ contains
             #:endfor
         end do
 
-        ! NVIDIA UVM variables
-        call MPI_BCAST(nv_uvm_out_of_core, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
-        call MPI_BCAST(nv_uvm_igr_temps_on_gpu, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
-        call MPI_BCAST(nv_uvm_pref_gpu, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
-
     end subroutine s_mpi_bcast_user_inputs
 
     !> Broadcast random phase numbers from rank 0 to all MPI processes

diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
@@ -76,7 +76,7 @@ contains
             igr_iter_solver, igr_pres_lim, &
         #:endif
         file_per_process, n_start, t_save, t_stop, cfl_adap_dt, cfl_const_dt, cfl_target, num_bc_patches, alf_factor, &
-            & num_igr_iters, num_igr_warm_start_iters, nv_uvm_out_of_core, nv_uvm_igr_temps_on_gpu, nv_uvm_pref_gpu, down_sample, &
+            & num_igr_iters, num_igr_warm_start_iters, down_sample, &
             & double_mach
 
         inquire (FILE=trim(file_path), EXIST=file_exist)
@@ -575,9 +575,7 @@ contains
         call cpu_time(start)
         call nvtxStartRange("SAVE-DATA")
         do i = 1, sys_size
-#ifndef FRONTIER_UNIFIED
             $:GPU_UPDATE(host='[q_cons_ts(stor)%vf(i)%sf]')
-#endif
             do l = 0, p
                 do k = 0, n
                     do j = 0, m