diff --git a/.buildkite/distributed/pipeline.yml b/.buildkite/distributed/pipeline.yml index 3787a65d1b..c8a2a15a63 100644 --- a/.buildkite/distributed/pipeline.yml +++ b/.buildkite/distributed/pipeline.yml @@ -86,7 +86,7 @@ steps: slurm_mem: 120G slurm_ntasks: 4 - - label: "🕺 gpu distributed hydrostatic model tests" + - label: "🦏 gpu distributed hydrostatic model tests" key: "distributed_hydrostatic_model_gpu" env: TEST_GROUP: "distributed_hydrostatic_model" @@ -97,7 +97,7 @@ steps: slurm_ntasks: 4 slurm_gpus_per_task: 1 - - label: "🤺 cpu distributed nonhydrostatic regression" + - label: "🦍 cpu distributed nonhydrostatic regression" key: "distributed_nonhydrostatic_regression_cpu" env: TEST_GROUP: "distributed_nonhydrostatic_regression" diff --git a/src/DistributedComputations/DistributedComputations.jl b/src/DistributedComputations/DistributedComputations.jl index 0f5f28007c..f3c1880d61 100644 --- a/src/DistributedComputations/DistributedComputations.jl +++ b/src/DistributedComputations/DistributedComputations.jl @@ -23,5 +23,6 @@ include("transposable_field.jl") include("distributed_transpose.jl") include("plan_distributed_transforms.jl") include("distributed_fft_based_poisson_solver.jl") +include("distributed_fft_tridiagonal_solver.jl") end # module diff --git a/src/DistributedComputations/distributed_architectures.jl b/src/DistributedComputations/distributed_architectures.jl index b1647747be..b69c58da19 100644 --- a/src/DistributedComputations/distributed_architectures.jl +++ b/src/DistributedComputations/distributed_architectures.jl @@ -24,7 +24,7 @@ end Return `Partition` representing the division of a domain in the `x` (first), `y` (second) and `z` (third) dimension -Keyword arguments: +Keyword arguments: ================== - `x`: partitioning of the first dimension diff --git a/src/DistributedComputations/distributed_fft_based_poisson_solver.jl b/src/DistributedComputations/distributed_fft_based_poisson_solver.jl index 7c259b9e3d..c73167e10d 100644 --- a/src/DistributedComputations/distributed_fft_based_poisson_solver.jl +++ b/src/DistributedComputations/distributed_fft_based_poisson_solver.jl @@ -1,7 +1,7 @@ import FFTW using CUDA: @allowscalar -using Oceananigans.Grids: XYZRegularRG +using Oceananigans.Grids: XYZRegularRG, XYRegularRG, XZRegularRG, YZRegularRG import Oceananigans.Solvers: poisson_eigenvalues, solve! import Oceananigans.Architectures: architecture @@ -60,9 +60,9 @@ In the algorithm below, the first dimension is always the local dimension. In ou 1. `storage.zfield`, partitioned over ``(x, y)`` is initialized with the `rhs` that is ``b``. 2. Transform along ``z``. -3 Transpose + communicate to `storage.yfield` partitioned into `(Rx, Ry)` processes in ``(x, z)``. +3 Transpose `storage.zfield` + communicate to `storage.yfield` partitioned into `(Rx, Ry)` processes in ``(x, z)``. 4. Transform along ``y``. -5. Transpose + communicate to `storage.xfield` partitioned into `(Rx, Ry)` processes in ``(y, z)``. +5. Transpose `storage.yfield` + communicate to `storage.xfield` partitioned into `(Rx, Ry)` processes in ``(y, z)``. 6. Transform in ``x``. 
At this point the three in-place forward transforms are complete, and we @@ -91,8 +91,8 @@ Restrictions """ function DistributedFFTBasedPoissonSolver(global_grid, local_grid, planner_flag=FFTW.PATIENT) - validate_global_grid(global_grid) - validate_configuration(global_grid, local_grid) + validate_poisson_solver_distributed_grid(global_grid) + validate_poisson_solver_configuration(global_grid, local_grid) FT = Complex{eltype(local_grid)} @@ -188,38 +188,41 @@ end end # TODO: bring up to speed the PCG to remove this error -validate_global_grid(global_grid) = +validate_poisson_solver_distributed_grid(global_grid) = throw("Grids other than the RectilinearGrid are not supported in the Distributed NonhydrostaticModels") -function validate_global_grid(global_grid::RectilinearGrid) +function validate_poisson_solver_distributed_grid(global_grid::RectilinearGrid) TX, TY, TZ = topology(global_grid) if (TY == Bounded && TZ == Periodic) || (TX == Bounded && TY == Periodic) || (TX == Bounded && TZ == Periodic) - throw("NonhydrostaticModels on Distributed grids do not support topology ($TX, $TY, $TZ) at the moment. - TZ Periodic requires also TY and TX to be Periodic, while TY Periodic requires also TX to be Periodic. - Please rotate the domain to obtain the required topology") + throw("Distributed Poisson solvers do not support grids with topology ($TX, $TY, $TZ) at the moment. + A Periodic z-direction also requires the y- and x-directions to be Periodic, while a Periodic y-direction also requires + the x-direction to be Periodic.") end - if !(global_grid isa XYZRegularRG) - throw("Stretched directions are not supported with distributed grids at the moment.") + if !(global_grid isa YZRegularRG) && !(global_grid isa XYRegularRG) && !(global_grid isa XZRegularRG) + throw("The provided grid is stretched in directions $(stretched_dimensions(global_grid)). + A distributed Poisson solver supports only RectilinearGrids that have variably-spaced cells in at most one direction.") end return nothing end -function validate_configuration(global_grid, local_grid) +function validate_poisson_solver_configuration(global_grid, local_grid) # We don't support distributing anything in z. - Rz = architecture(local_grid).ranks[3] - Rz == 1 || throw("Non-singleton ranks in the vertical are not supported by DistributedFFTBasedPoissonSolver.") + Rx, Ry, Rz = architecture(local_grid).ranks + Rz == 1 || throw("Non-singleton ranks in the vertical are not supported by distributed Poisson solvers.") # Limitation of the current implementation (see the docstring) - if global_grid.Nz % architecture(local_grid).ranks[2] != 0 - throw("The number of ranks in the y direction must divide Nz. See the docstring for more information.") + if global_grid.Nz % Ry != 0 - throw("The number of ranks in the y direction must divide Nz. See the docstring for more information.")
+ The distributed Poisson solver requires that the number of ranks in the x-direction divide Ny.") end return nothing diff --git a/src/DistributedComputations/distributed_fft_tridiagonal_solver.jl b/src/DistributedComputations/distributed_fft_tridiagonal_solver.jl new file mode 100644 index 0000000000..8911cb98d8 --- /dev/null +++ b/src/DistributedComputations/distributed_fft_tridiagonal_solver.jl @@ -0,0 +1,352 @@ +using CUDA: @allowscalar +using Oceananigans.Grids: stretched_dimensions +using Oceananigans.Grids: XDirection, YDirection +using Oceananigans.Operators: Δxᶠᵃᵃ, Δyᵃᶠᵃ, Δzᵃᵃᶠ + +using Oceananigans.Solvers: BatchedTridiagonalSolver, + stretched_direction, + ZTridiagonalSolver, + YTridiagonalSolver, + XTridiagonalSolver, + compute_main_diagonal! + +struct DistributedFourierTridiagonalPoissonSolver{G, L, B, P, R, S, β} + plan :: P + global_grid :: G + local_grid :: L + batched_tridiagonal_solver :: B + source_term :: R + storage :: S + buffer :: β +end + +# Useful aliases for dispatch... +const XStretchedDistributedSolver = DistributedFourierTridiagonalPoissonSolver{<:Any, <:Any, <:XTridiagonalSolver} +const YStretchedDistributedSolver = DistributedFourierTridiagonalPoissonSolver{<:Any, <:Any, <:YTridiagonalSolver} +const ZStretchedDistributedSolver = DistributedFourierTridiagonalPoissonSolver{<:Any, <:Any, <:ZTridiagonalSolver} + +architecture(solver::DistributedFourierTridiagonalPoissonSolver) = + architecture(solver.global_grid) + +@inline Δξᶠ(i, grid, ::Val{1}) = Δxᶠᵃᵃ(i, 1, 1, grid) +@inline Δξᶠ(j, grid, ::Val{2}) = Δyᵃᶠᵃ(1, j, 1, grid) +@inline Δξᶠ(k, grid, ::Val{3}) = Δzᵃᵃᶠ(1, 1, k, grid) + +""" + DistributedFourierTridiagonalPoissonSolver(global_grid, local_grid) + +Return an FFT-based solver for the Poisson equation evaluated on a `local_grid` that has a non-uniform +spacing in exactly one direction (i.e. either in x, y or z) + +```math +∇²φ = b +``` + +for `Distributed` architectures. + +Supported configurations +======================== + +In the following, `Nx`, `Ny`, and `Nz` are the number of grid points of the **global** grid +in the `x`, `y`, and `z` directions, while `Rx`, `Ry`, and `Rz` are the number of ranks in the +`x`, `y`, and `z` directions, respectively. Furthermore, 'pencil' decomposition refers to a domain +decomposed in two different directions (i.e., with `Rx != 1` and `Ry != 1`), while 'slab' decomposition +refers to a domain decomposed only in one direction (i.e., with either `Rx == 1` or `Ry == 1`). +Additionally, `storage` indicates the `TransposableField` used for storing intermediate results; +see [`TransposableField`](@ref). + +1. Three dimensional configurations with 'pencil' decompositions in ``(x, y)`` +where `Ny ≥ Rx` and `Ny % Rx = 0`, and `Nz ≥ Ry` and `Nz % Ry = 0`. + +2. Two dimensional configurations decomposed in ``x`` where `Ny ≥ Rx` and `Ny % Rx = 0`. + +!!! warning "Unsupported decompositions" + _Any_ configuration decomposed in the ``z`` direction is _not_ supported. + Furthermore, any ``(x, y)`` decompositions other than the configurations mentioned above are also _not_ supported.
+ +Algorithm for pencil decompositions +=================================== + +For pencil decompositions (useful for three-dimensional problems), +there are two forward transforms, two backward transforms, one tri-diagonal matrix inversion +and a variable number of transpositions that require MPI communication, depending on the +stretched direction: + +- a stretching in the x-direction requires four transpositions +- a stretching in the y-direction requires six transpositions +- a stretching in the z-direction requires eight transpositions + +!!! note "Computational cost" + Because of the additional transpositions, a stretching in the x-direction + is computationally cheaper than a stretching in the y-direction, and the latter + is cheaper than a stretching in the z-direction. + +In our implementation we require `Nz ≥ Ry` and `Ny ≥ Rx` with the additional constraint +that `Nz % Ry = 0` and `Ny % Rx = 0`. + +x - stretched algorithm +======================== + +1. `storage.zfield`, partitioned over ``(x, y)`` is initialized with the `rhs`. +2. Transform along ``z``. +3. Transpose `storage.zfield` + communicate to `storage.yfield` partitioned into `(Rx, Ry)` processes in ``(x, z)``. +4. Transform along ``y``. +5. Transpose `storage.yfield` + communicate to `storage.xfield` partitioned into `(Rx, Ry)` processes in ``(y, z)``. +6. Solve the tri-diagonal linear system in the ``x`` direction. + +Steps 5 -> 1 are reversed to obtain the result in physical space stored in `storage.zfield` +partitioned over ``(x, y)``. + +y - stretched algorithm +======================== + +1. `storage.zfield`, partitioned over ``(x, y)`` is initialized with the `rhs`. +2. Transform along ``z``. +3. Transpose `storage.zfield` + communicate to `storage.yfield` partitioned into `(Rx, Ry)` processes in ``(x, z)``. +4. Transpose `storage.yfield` + communicate to `storage.xfield` partitioned into `(Rx, Ry)` processes in ``(y, z)``. +5. Transform along ``x``. +6. Transpose `storage.xfield` + communicate to `storage.yfield` partitioned into `(Rx, Ry)` processes in ``(x, z)``. +7. Solve the tri-diagonal linear system in the ``y`` direction. + +Steps 6 -> 1 are reversed to obtain the result in physical space stored in `storage.zfield` +partitioned over ``(x, y)``. + +z - stretched algorithm +======================== + +1. `storage.zfield`, partitioned over ``(x, y)`` is initialized with the `rhs`. +2. Transpose `storage.zfield` + communicate to `storage.yfield` partitioned into `(Rx, Ry)` processes in ``(x, z)``. +3. Transform along ``y``. +4. Transpose `storage.yfield` + communicate to `storage.xfield` partitioned into `(Rx, Ry)` processes in ``(y, z)``. +5. Transform along ``x``. +6. Transpose `storage.xfield` + communicate to `storage.yfield` partitioned into `(Rx, Ry)` processes in ``(x, z)``. +7. Transpose `storage.yfield` + communicate to `storage.zfield` partitioned into `(Rx, Ry)` processes in ``(x, y)``. +8. Solve the tri-diagonal linear system in the ``z`` direction. + +Steps 7 -> 1 are reversed to obtain the result in physical space stored in `storage.zfield` +partitioned over ``(x, y)``. + +Algorithm for slab decompositions +================================= + +The 'slab' decomposition works in the same manner while skipping the transposes that +are not required. For example, if the domain is decomposed in ``x``, steps 4 and 6 in the above algorithm +are skipped (as are the associated reversed steps in the backward transform). + +Restrictions +============ + +1.
Pencil decompositions: + - `Ny ≥ Rx` and `Ny % Rx = 0` + - `Nz ≥ Ry` and `Nz % Ry = 0` + - If the ``z`` direction is `Periodic`, also the ``y`` and the ``x`` directions must be `Periodic` + - If the ``y`` direction is `Periodic`, also the ``x`` direction must be `Periodic` + +2. Slab decomposition: + - Same as for two-dimensional decompositions with `Rx` (or `Ry`) equal to one + +""" +function DistributedFourierTridiagonalPoissonSolver(global_grid, local_grid, planner_flag=FFTW.PATIENT; tridiagonal_direction = nothing) + + validate_poisson_solver_distributed_grid(global_grid) + validate_poisson_solver_configuration(global_grid, local_grid) + + if isnothing(tridiagonal_direction) + tridiagonal_dim = stretched_dimensions(local_grid)[1] + tridiagonal_direction = stretched_direction(local_grid) + else + tridiagonal_dim = tridiagonal_direction == XDirection() ? 1 : + tridiagonal_direction == YDirection() ? 2 : 3 + end + + topology(global_grid, tridiagonal_dim) != Bounded && + error("`DistributedFourierTridiagonalPoissonSolver` requires that the stretched direction (dimension $tridiagonal_dim) is `Bounded`.") + + FT = Complex{eltype(local_grid)} + child_arch = child_architecture(local_grid) + storage = TransposableField(CenterField(local_grid), FT) + + topo = (TX, TY, TZ) = topology(global_grid) + λx = dropdims(poisson_eigenvalues(global_grid.Nx, global_grid.Lx, 1, TX()), dims=(2, 3)) + λy = dropdims(poisson_eigenvalues(global_grid.Ny, global_grid.Ly, 2, TY()), dims=(1, 3)) + λz = dropdims(poisson_eigenvalues(global_grid.Nz, global_grid.Lz, 3, TZ()), dims=(1, 2)) + + if tridiagonal_dim == 1 + arch = architecture(storage.xfield.grid) + grid = storage.xfield.grid + λ1 = partition_coordinate(λy, size(grid, 2), arch, 2) + λ2 = partition_coordinate(λz, size(grid, 3), arch, 3) + elseif tridiagonal_dim == 2 + arch = architecture(storage.yfield.grid) + grid = storage.yfield.grid + λ1 = partition_coordinate(λx, size(grid, 1), arch, 1) + λ2 = partition_coordinate(λz, size(grid, 3), arch, 3) + elseif tridiagonal_dim == 3 + arch = architecture(storage.zfield.grid) + grid = storage.zfield.grid + λ1 = partition_coordinate(λx, size(grid, 1), arch, 1) + λ2 = partition_coordinate(λy, size(grid, 2), arch, 2) + end + + λ1 = on_architecture(child_arch, λ1) + λ2 = on_architecture(child_arch, λ2) + + plan = plan_distributed_transforms(global_grid, storage, planner_flag) + + # Lower and upper diagonals are the same + lower_diagonal = @allowscalar [ 1 / Δξᶠ(q, grid, Val(tridiagonal_dim)) for q in 2:size(grid, tridiagonal_dim) ] + lower_diagonal = on_architecture(child_arch, lower_diagonal) + upper_diagonal = lower_diagonal + + # Compute diagonal coefficients for each grid point + diagonal = zeros(eltype(grid), size(grid)...) 
+ diagonal = on_architecture(arch, diagonal) + launch_config = if tridiagonal_dim == 1 + :yz + elseif tridiagonal_dim == 2 + :xz + elseif tridiagonal_dim == 3 + :xy + end + + launch!(arch, grid, launch_config, compute_main_diagonal!, diagonal, grid, λ1, λ2, tridiagonal_direction) + + # Set up batched tridiagonal solver + btsolver = BatchedTridiagonalSolver(grid; lower_diagonal, diagonal, upper_diagonal, tridiagonal_direction) + + # We need to permute indices to apply bounded transforms on the GPU (r2r of r2c with twiddling) + x_buffer_needed = child_arch isa GPU && TX == Bounded + z_buffer_needed = child_arch isa GPU && TZ == Bounded + + # We cannot really batch anything, so on GPUs we always have to permute indices in the y direction + y_buffer_needed = child_arch isa GPU + + buffer_x = x_buffer_needed ? on_architecture(child_arch, zeros(FT, size(storage.xfield)...)) : nothing + buffer_y = y_buffer_needed ? on_architecture(child_arch, zeros(FT, size(storage.yfield)...)) : nothing + buffer_z = z_buffer_needed ? on_architecture(child_arch, zeros(FT, size(storage.zfield)...)) : nothing + + buffer = if tridiagonal_dim == 1 + (; y = buffer_y, z = buffer_z) + elseif tridiagonal_dim == 2 + (; x = buffer_x, z = buffer_z) + elseif tridiagonal_dim == 3 + (; x = buffer_x, y = buffer_y) + end + + if tridiagonal_dim == 1 + forward = (y! = plan.forward.y!, z! = plan.forward.z!) + backward = (y! = plan.backward.y!, z! = plan.backward.z!) + elseif tridiagonal_dim == 2 + forward = (x! = plan.forward.x!, z! = plan.forward.z!) + backward = (x! = plan.backward.x!, z! = plan.backward.z!) + elseif tridiagonal_dim == 3 + forward = (x! = plan.forward.x!, y! = plan.forward.y!) + backward = (x! = plan.backward.x!, y! = plan.backward.y!) + end + + plan = (; forward, backward) + + # Storage space for right hand side of Poisson equation + T = complex(eltype(grid)) + source_term = zeros(T, size(grid)...) + source_term = on_architecture(arch, source_term) + + return DistributedFourierTridiagonalPoissonSolver(plan, global_grid, local_grid, btsolver, source_term, storage, buffer) +end + +# solve! requires that `b` in `A x = b` (the right hand side) +# is copied in the solver storage +# See: Models/NonhydrostaticModels/solve_for_pressure.jl +function solve!(x, solver::ZStretchedDistributedSolver) + arch = architecture(solver) + storage = solver.storage + buffer = solver.buffer + + transpose_z_to_y!(storage) # copy data from storage.zfield to storage.yfield + solver.plan.forward.y!(parent(storage.yfield), buffer.y) + transpose_y_to_x!(storage) # copy data from storage.yfield to storage.xfield + solver.plan.forward.x!(parent(storage.xfield), buffer.x) + transpose_x_to_y!(storage) # copy data from storage.xfield to storage.yfield + transpose_y_to_z!(storage) # copy data from storage.yfield to storage.zfield + + # copy results in the source term + parent(solver.source_term) .= parent(storage.zfield) + + # Perform the implicit vertical solve here on storage.zfield... + # Solve tridiagonal system of linear equations at every z-column. 
+ solve!(storage.zfield, solver.batched_tridiagonal_solver, solver.source_term) + + transpose_z_to_y!(storage) + transpose_y_to_x!(storage) # copy data from storage.yfield to storage.xfield + solver.plan.backward.x!(parent(storage.xfield), buffer.x) + transpose_x_to_y!(storage) # copy data from storage.xfield to storage.yfield + solver.plan.backward.y!(parent(storage.yfield), buffer.y) + transpose_y_to_z!(storage) # copy data from storage.yfield to storage.zfield + + # Copy the real component of storage.zfield to x. + launch!(arch, solver.local_grid, :xyz, + _copy_real_component!, x, parent(storage.zfield)) + + return x +end + +function solve!(x, solver::YStretchedDistributedSolver) + arch = architecture(solver) + storage = solver.storage + buffer = solver.buffer + + solver.plan.forward.z!(parent(storage.zfield), buffer.z) + transpose_z_to_y!(storage) # copy data from storage.zfield to storage.yfield + transpose_y_to_x!(storage) # copy data from storage.yfield to storage.xfield + solver.plan.forward.x!(parent(storage.xfield), buffer.x) + transpose_x_to_y!(storage) # copy data from storage.xfield to storage.yfield + + # copy the results into the source term + parent(solver.source_term) .= parent(storage.yfield) + + # Perform the implicit solve along y here on storage.yfield... + # Solve tridiagonal system of linear equations at every y-column. + solve!(storage.yfield, solver.batched_tridiagonal_solver, solver.source_term) + + transpose_y_to_x!(storage) # copy data from storage.yfield to storage.xfield + solver.plan.backward.x!(parent(storage.xfield), buffer.x) + transpose_x_to_y!(storage) # copy data from storage.xfield to storage.yfield + transpose_y_to_z!(storage) # copy data from storage.yfield to storage.zfield + solver.plan.backward.z!(parent(storage.zfield), buffer.z) + + # Copy the real component of storage.zfield to x. + launch!(arch, solver.local_grid, :xyz, + _copy_real_component!, x, parent(storage.zfield)) + + return x +end + +function solve!(x, solver::XStretchedDistributedSolver) + arch = architecture(solver) + storage = solver.storage + buffer = solver.buffer + + # Apply forward transforms to the rhs stored in storage.zfield. + solver.plan.forward.z!(parent(storage.zfield), buffer.z) + transpose_z_to_y!(storage) # copy data from storage.zfield to storage.yfield + solver.plan.forward.y!(parent(storage.yfield), buffer.y) + transpose_y_to_x!(storage) # copy data from storage.yfield to storage.xfield + + # copy the results into the source term + parent(solver.source_term) .= parent(storage.xfield) + + # Perform the implicit solve along x here on storage.xfield... + # Solve tridiagonal system of linear equations at every x-column. + solve!(storage.xfield, solver.batched_tridiagonal_solver, solver.source_term) + + transpose_x_to_y!(storage) # copy data from storage.xfield to storage.yfield + solver.plan.backward.y!(parent(storage.yfield), buffer.y) + transpose_y_to_z!(storage) # copy data from storage.yfield to storage.zfield + solver.plan.backward.z!(parent(storage.zfield), buffer.z) # last backwards transform is in z + + # Copy the real component of storage.zfield to x.
+ launch!(arch, solver.local_grid, :xyz, + _copy_real_component!, x, parent(storage.zfield)) + + return x +end \ No newline at end of file diff --git a/src/DistributedComputations/distributed_grids.jl b/src/DistributedComputations/distributed_grids.jl index 5f5dee5ef3..686f9c9541 100644 --- a/src/DistributedComputations/distributed_grids.jl +++ b/src/DistributedComputations/distributed_grids.jl @@ -84,7 +84,6 @@ function RectilinearGrid(arch::Distributed, xl = Rx == 1 ? x : partition_coordinate(x, nx, arch, 1) yl = Ry == 1 ? y : partition_coordinate(y, ny, arch, 2) zl = Rz == 1 ? z : partition_coordinate(z, nz, arch, 3) - Lx, xᶠᵃᵃ, xᶜᵃᵃ, Δxᶠᵃᵃ, Δxᶜᵃᵃ = generate_coordinate(FT, topology[1](), nx, Hx, xl, :x, child_architecture(arch)) Ly, yᵃᶠᵃ, yᵃᶜᵃ, Δyᵃᶠᵃ, Δyᵃᶜᵃ = generate_coordinate(FT, topology[2](), ny, Hy, yl, :y, child_architecture(arch)) Lz, zᵃᵃᶠ, zᵃᵃᶜ, Δzᵃᵃᶠ, Δzᵃᵃᶜ = generate_coordinate(FT, topology[3](), nz, Hz, zl, :z, child_architecture(arch)) @@ -190,9 +189,9 @@ function reconstruct_global_grid(grid::DistributedRectilinearGrid) z = cpu_face_constructor_z(grid) ## This will not work with 3D parallelizations!! - xG = Rx == 1 ? x : assemble_coordinate(x, nx, Rx, ri, rj, rk, arch.communicator) - yG = Ry == 1 ? y : assemble_coordinate(y, ny, Ry, rj, ri, rk, arch.communicator) - zG = Rz == 1 ? z : assemble_coordinate(z, nz, Rz, rk, ri, rj, arch.communicator) + xG = Rx == 1 ? x : assemble_coordinate(x, nx, arch, 1) + yG = Ry == 1 ? y : assemble_coordinate(y, ny, arch, 2) + zG = Rz == 1 ? z : assemble_coordinate(z, nz, arch, 3) child_arch = child_architecture(arch) @@ -233,9 +232,9 @@ function reconstruct_global_grid(grid::DistributedLatitudeLongitudeGrid) z = cpu_face_constructor_z(grid) ## This will not work with 3D parallelizations!! - λG = Rx == 1 ? λ : assemble_coordinate(λ, nλ, Rx, ri, rj, rk, arch.communicator) - φG = Ry == 1 ? φ : assemble_coordinate(φ, nφ, Ry, rj, ri, rk, arch.communicator) - zG = Rz == 1 ? z : assemble_coordinate(z, nz, Rz, rk, ri, rj, arch.communicator) + λG = Rx == 1 ? λ : assemble_coordinate(λ, nλ, arch, 1) + φG = Ry == 1 ? φ : assemble_coordinate(φ, nφ, arch, 2) + zG = Rz == 1 ? z : assemble_coordinate(z, nz, arch, 3) child_arch = child_architecture(arch) diff --git a/src/DistributedComputations/partition_assemble.jl b/src/DistributedComputations/partition_assemble.jl index 6502901f64..c56d0b2f22 100644 --- a/src/DistributedComputations/partition_assemble.jl +++ b/src/DistributedComputations/partition_assemble.jl @@ -43,11 +43,12 @@ function partition_coordinate(c::AbstractVector, n, arch, idx) r = arch.local_index[idx] start_idx = sum(nl[1:r-1]) + 1 # sum of all previous rank's dimension + 1 - end_idx = if r == arch.ranks[idx] - length(c) + end_idx = if r == ranks(arch)[idx] + length(c) else sum(nl[1:r]) + 1 end + return c[start_idx : end_idx] end @@ -74,32 +75,40 @@ a local number of elements `Nc`, number of ranks `Nr`, rank `r`, and `arch`itecture. Since we use a global reduction, only ranks at positions 1 in the other two directions `r1 == 1` and `r2 == 1` fill the 1D array. 
""" -function assemble_coordinate(c_local::AbstractVector, n, R, r, r1, r2, comm) - nl = concatenate_local_sizes(n, R, r) +function assemble_coordinate(c_local::AbstractVector, n, arch, idx) + nl = concatenate_local_sizes(n, arch, idx) + R = arch.ranks[idx] + r = arch.local_index[idx] + r2 = [arch.local_index[i] for i in filter(x -> x != idx, (1, 2, 3))] c_global = zeros(eltype(c_local), sum(nl)+1) - if r1 == 1 && r2 == 1 + if r2[1] == 1 && r2[2] == 1 c_global[1 + sum(nl[1:r-1]) : sum(nl[1:r])] .= c_local[1:end-1] - r == Nr && (c_global[end] = c_local[end]) + r == R && (c_global[end] = c_local[end]) end - MPI.Allreduce!(c_global, +, comm) + MPI.Allreduce!(c_global, +, arch.communicator) return c_global end # Simple case, just take the first and the last core -function assemble_coordinate(c::Tuple, n, R, r, r1, r2, comm) +function assemble_coordinate(c_local::Tuple, n, arch, idx) c_global = zeros(Float64, 2) - - if r == 1 && r1 == 1 && r2 == 1 - c_global[1] = c[1] - elseif r == R && r1 == 1 && r2 == 1 - c_global[2] = c[2] + + rank = arch.local_index + R = arch.ranks[idx] + r = rank[idx] + r2 = [rank[i] for i in filter(x -> x != idx, (1, 2, 3))] + + if rank[1] == 1 && rank[2] == 1 && rank[3] == 1 + c_global[1] = c_local[1] + elseif r == R && r2[1] == 1 && r2[1] == 1 + c_global[2] = c_local[2] end - MPI.Allreduce!(c_global, +, comm) + MPI.Allreduce!(c_global, +, arch.communicator) return tuple(c_global...) end diff --git a/src/DistributedComputations/transposable_field.jl b/src/DistributedComputations/transposable_field.jl index 183846b798..be630cc46a 100644 --- a/src/DistributedComputations/transposable_field.jl +++ b/src/DistributedComputations/transposable_field.jl @@ -138,9 +138,9 @@ function twin_grid(grid::DistributedGrid; local_direction = :y) y = cpu_face_constructor_y(grid) z = cpu_face_constructor_z(grid) - xG = R[1] == 1 ? x : assemble_coordinate(x, nx, R[1], ri, rj, rk, arch.communicator) - yG = R[2] == 1 ? y : assemble_coordinate(y, ny, R[2], rj, ri, rk, arch.communicator) - zG = R[3] == 1 ? z : assemble_coordinate(z, nz, R[3], rk, ri, rj, arch.communicator) + xG = R[1] == 1 ? x : assemble_coordinate(x, nx, arch, 1) + yG = R[2] == 1 ? y : assemble_coordinate(y, ny, arch, 2) + zG = R[3] == 1 ? z : assemble_coordinate(z, nz, arch, 3) child_arch = child_architecture(arch) diff --git a/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl b/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl index 5fccd29f82..57be90b7f0 100644 --- a/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl +++ b/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl @@ -10,7 +10,9 @@ using Oceananigans.Utils using Oceananigans.Grids using Oceananigans.Solvers -using Oceananigans.DistributedComputations: Distributed, DistributedFFTBasedPoissonSolver, reconstruct_global_grid +using Oceananigans.DistributedComputations +using Oceananigans.DistributedComputations: reconstruct_global_grid, Distributed +using Oceananigans.DistributedComputations: DistributedFFTBasedPoissonSolver, DistributedFourierTridiagonalPoissonSolver using Oceananigans.Grids: XYRegularRG, XZRegularRG, YZRegularRG, XYZRegularRG using Oceananigans.ImmersedBoundaries: ImmersedBoundaryGrid using Oceananigans.Utils: SumOfArrays @@ -19,11 +21,26 @@ import Oceananigans: fields, prognostic_fields import Oceananigans.Advection: cell_advection_timescale import Oceananigans.TimeSteppers: step_lagrangian_particles! 
-function PressureSolver(arch::Distributed, local_grid::XYZRegularRG) +function PressureSolver(::Distributed, local_grid::XYZRegularRG) global_grid = reconstruct_global_grid(local_grid) return DistributedFFTBasedPoissonSolver(global_grid, local_grid) end +function PressureSolver(::Distributed, local_grid::XYRegularRG) + global_grid = reconstruct_global_grid(local_grid) + return DistributedFourierTridiagonalPoissonSolver(global_grid, local_grid) +end + +function PressureSolver(::Distributed, local_grid::YZRegularRG) + global_grid = reconstruct_global_grid(local_grid) + return DistributedFourierTridiagonalPoissonSolver(global_grid, local_grid) +end + +function PressureSolver(::Distributed, local_grid::XZRegularRG) + global_grid = reconstruct_global_grid(local_grid) + return DistributedFourierTridiagonalPoissonSolver(global_grid, local_grid) +end + PressureSolver(arch, grid::XYZRegularRG) = FFTBasedPoissonSolver(grid) PressureSolver(arch, grid::XYRegularRG) = FourierTridiagonalPoissonSolver(grid) PressureSolver(arch, grid::XZRegularRG) = FourierTridiagonalPoissonSolver(grid) diff --git a/src/Models/NonhydrostaticModels/solve_for_pressure.jl b/src/Models/NonhydrostaticModels/solve_for_pressure.jl index 10154fc91e..5064a2c5d6 100644 --- a/src/Models/NonhydrostaticModels/solve_for_pressure.jl +++ b/src/Models/NonhydrostaticModels/solve_for_pressure.jl @@ -61,6 +61,22 @@ function solve_for_pressure!(pressure, solver::FFTBasedPoissonSolver, Δt, U★) return nothing end +function solve_for_pressure!(pressure, solver::DistributedFourierTridiagonalPoissonSolver, Δt, U★) + + # Calculate right hand side: + rhs = solver.storage.zfield + arch = architecture(solver) + grid = solver.local_grid + + launch!(arch, grid, :xyz, calculate_pressure_source_term_fourier_tridiagonal_solver!, + rhs, grid, Δt, U★, solver.batched_tridiagonal_solver.tridiagonal_direction) + + # Pressure Poisson rhs, scaled by the spacing in the stretched direction at ᶜᶜᶜ, is stored in solver.source_term: + solve!(pressure, solver) + + return nothing +end + function solve_for_pressure!(pressure, solver::FourierTridiagonalPoissonSolver, Δt, U★) # Calculate right hand side: diff --git a/src/Solvers/batched_tridiagonal_solver.jl b/src/Solvers/batched_tridiagonal_solver.jl index 6316fc15d2..2886847938 100644 --- a/src/Solvers/batched_tridiagonal_solver.jl +++ b/src/Solvers/batched_tridiagonal_solver.jl @@ -18,8 +18,12 @@ struct BatchedTridiagonalSolver{A, B, C, T, G, P, D} tridiagonal_direction :: D end -architecture(solver::BatchedTridiagonalSolver) = architecture(solver.grid) +# Some aliases... +const XTridiagonalSolver = BatchedTridiagonalSolver{A, B, C, T, G, P, <:XDirection} where {A, B, C, T, G, P} +const YTridiagonalSolver = BatchedTridiagonalSolver{A, B, C, T, G, P, <:YDirection} where {A, B, C, T, G, P} +const ZTridiagonalSolver = BatchedTridiagonalSolver{A, B, C, T, G, P, <:ZDirection} where {A, B, C, T, G, P} +architecture(solver::BatchedTridiagonalSolver) = architecture(solver.grid) """ BatchedTridiagonalSolver(grid; diff --git a/test/test_distributed_poisson_solvers.jl b/test/test_distributed_poisson_solvers.jl index c5613ae9a7..dca406d8c3 100644 --- a/test/test_distributed_poisson_solvers.jl +++ b/test/test_distributed_poisson_solvers.jl @@ -30,7 +30,7 @@ include("dependencies_for_poisson_solvers.jl") # to initialize MPI. 
-using Oceananigans.DistributedComputations: reconstruct_global_grid, DistributedGrid, Partition +using Oceananigans.DistributedComputations: reconstruct_global_grid, DistributedGrid, Partition, DistributedFourierTridiagonalPoissonSolver using Oceananigans.Models.NonhydrostaticModels: solve_for_pressure! function random_divergent_source_term(grid::DistributedGrid) @@ -88,6 +88,39 @@ function divergence_free_poisson_solution(grid_points, ranks, topo, child_arch) return Array(interior(∇²ϕ)) ≈ Array(R) end +function divergence_free_poisson_tridiagonal_solution(grid_points, ranks, stretched_direction, child_arch) + arch = Distributed(child_arch, partition=Partition(ranks...)) + + if stretched_direction == :x + x = collect(range(0, 2π, length = grid_points[1]+1)) + y = z = (0, 2π) + elseif stretched_direction == :y + y = collect(range(0, 2π, length = grid_points[2]+1)) + x = z = (0, 2π) + elseif stretched_direction == :z + z = collect(range(0, 2π, length = grid_points[3]+1)) + x = y = (0, 2π) + end + + local_grid = RectilinearGrid(arch; topology=(Bounded, Bounded, Bounded), size=grid_points, x, y, z) + + # The test will solve for ϕ, then compare R to ∇²ϕ. + ϕ = CenterField(local_grid) + ∇²ϕ = CenterField(local_grid) + R, U = random_divergent_source_term(local_grid) + + global_grid = reconstruct_global_grid(local_grid) + solver = DistributedFourierTridiagonalPoissonSolver(global_grid, local_grid) + + # Using Δt = 1. + solve_for_pressure!(ϕ, solver, 1, U) + + # "Recompute" ∇²ϕ + compute_∇²!(∇²ϕ, ϕ, arch, local_grid) + + return Array(interior(∇²ϕ)) ≈ Array(R) +end + @testset "Distributed FFT-based Poisson solver" begin child_arch = test_child_arch() @@ -97,16 +130,30 @@ end (Bounded, Bounded, Bounded)) @info " Testing 3D distributed FFT-based Poisson solver with topology $topology..." - @show @test divergence_free_poisson_solution((44, 44, 8), (4, 1, 1), topology, child_arch) - @show @test divergence_free_poisson_solution((16, 44, 8), (4, 1, 1), topology, child_arch) - @show @test divergence_free_poisson_solution((44, 44, 8), (1, 4, 1), topology, child_arch) - @show @test divergence_free_poisson_solution((44, 16, 8), (1, 4, 1), topology, child_arch) - @show @test divergence_free_poisson_solution((16, 44, 8), (1, 4, 1), topology, child_arch) - @show @test divergence_free_poisson_solution((22, 44, 8), (2, 2, 1), topology, child_arch) - @show @test divergence_free_poisson_solution((44, 22, 8), (2, 2, 1), topology, child_arch) + @test divergence_free_poisson_solution((44, 44, 8), (4, 1, 1), topology, child_arch) + @test divergence_free_poisson_solution((16, 44, 8), (4, 1, 1), topology, child_arch) + @test divergence_free_poisson_solution((44, 44, 8), (1, 4, 1), topology, child_arch) + @test divergence_free_poisson_solution((44, 16, 8), (1, 4, 1), topology, child_arch) + @test divergence_free_poisson_solution((16, 44, 8), (1, 4, 1), topology, child_arch) + @test divergence_free_poisson_solution((22, 44, 8), (2, 2, 1), topology, child_arch) + @test divergence_free_poisson_solution((44, 22, 8), (2, 2, 1), topology, child_arch) @info " Testing 2D distributed FFT-based Poisson solver with topology $topology..." 
- @show @test divergence_free_poisson_solution((44, 16, 1), (4, 1, 1), topology, child_arch) - @show @test divergence_free_poisson_solution((16, 44, 1), (4, 1, 1), topology, child_arch) + @test divergence_free_poisson_solution((44, 16, 1), (4, 1, 1), topology, child_arch) + @test divergence_free_poisson_solution((16, 44, 1), (4, 1, 1), topology, child_arch) + end + + for stretched_direction in (:x, :y, :z) + @info " Testing 3D distributed Fourier Tridiagonal Poisson solver stretched in $stretched_direction" + @test divergence_free_poisson_tridiagonal_solution((44, 44, 8), (1, 4, 1), stretched_direction, child_arch) + @test divergence_free_poisson_tridiagonal_solution((44, 4, 8), (1, 4, 1), stretched_direction, child_arch) + @test divergence_free_poisson_tridiagonal_solution((16, 44, 8), (1, 4, 1), stretched_direction, child_arch) + @test divergence_free_poisson_tridiagonal_solution((22, 8, 8), (2, 2, 1), stretched_direction, child_arch) + @test divergence_free_poisson_tridiagonal_solution(( 8, 22, 8), (2, 2, 1), stretched_direction, child_arch) + @test divergence_free_poisson_tridiagonal_solution((44, 44, 8), (1, 4, 1), stretched_direction, child_arch) + @test divergence_free_poisson_tridiagonal_solution((44, 4, 8), (1, 4, 1), stretched_direction, child_arch) + @test divergence_free_poisson_tridiagonal_solution((16, 44, 8), (1, 4, 1), stretched_direction, child_arch) + @test divergence_free_poisson_tridiagonal_solution((22, 8, 8), (2, 2, 1), stretched_direction, child_arch) + @test divergence_free_poisson_tridiagonal_solution(( 8, 22, 8), (2, 2, 1), stretched_direction, child_arch) end end diff --git a/test/test_nonhydrostatic_regression.jl b/test/test_nonhydrostatic_regression.jl index deb9c5d72c..2d50afdfe2 100644 --- a/test/test_nonhydrostatic_regression.jl +++ b/test/test_nonhydrostatic_regression.jl @@ -53,11 +53,9 @@ include("regression_tests/ocean_large_eddy_simulation_regression_test.jl") A = typeof(arch) for grid_type in [:regular, :vertically_unstretched] - if !(arch isa Distributed && grid_type == :vertically_unstretched) - @testset "Rayleigh–Bénard tracer [$A, $grid_type grid]]" begin - @info " Testing Rayleigh–Bénard tracer regression [$A, $grid_type grid]" - run_rayleigh_benard_regression_test(arch, grid_type) - end + @testset "Rayleigh–Bénard tracer [$A, $grid_type grid]]" begin + @info " Testing Rayleigh–Bénard tracer regression [$A, $grid_type grid]" + run_rayleigh_benard_regression_test(arch, grid_type) end if !(arch isa Distributed)
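For reference, a minimal usage sketch of the new solver, following the pattern of the test added in `test/test_distributed_poisson_solvers.jl`. It assumes a CPU child architecture, four MPI ranks partitioned in `y` (launched with, e.g., `mpiexec -n 4 julia script.jl`), and a uniformly spaced face vector standing in for a genuinely stretched `z` coordinate; none of these choices are prescribed by the changeset itself.

```julia
# Sketch only: mirrors the added test divergence_free_poisson_tridiagonal_solution.
using MPI
using Oceananigans
using Oceananigans.DistributedComputations: Distributed, Partition, reconstruct_global_grid,
                                            DistributedFourierTridiagonalPoissonSolver

MPI.Initialized() || MPI.Init()

# Slab decomposition over y: Rx = 1, Ry = 4, Rz = 1 (the vertical is never distributed).
arch = Distributed(CPU(), partition = Partition(1, 4, 1))

# Passing z as a vector of cell interfaces marks the z-direction as stretched, so the
# grid is an XYRegularRG and PressureSolver dispatches to the Fourier tridiagonal solver.
z_faces = collect(range(0, 2π, length = 8 + 1))

local_grid = RectilinearGrid(arch;
                             topology = (Bounded, Bounded, Bounded),
                             size = (44, 44, 8),   # Nz = 8 is divisible by Ry = 4
                             x = (0, 2π), y = (0, 2π), z = z_faces)

# Construct the solver directly, as the test does...
global_grid = reconstruct_global_grid(local_grid)
solver = DistributedFourierTridiagonalPoissonSolver(global_grid, local_grid)

# ...or let the model choose it via the new PressureSolver methods.
model = NonhydrostaticModel(; grid = local_grid)
@info "Pressure solver in use: $(typeof(model.pressure_solver))"
```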