diff --git a/.buildkite/distributed/pipeline.yml b/.buildkite/distributed/pipeline.yml index 3787a65d1b..c8a2a15a63 100644 --- a/.buildkite/distributed/pipeline.yml +++ b/.buildkite/distributed/pipeline.yml @@ -86,7 +86,7 @@ steps: slurm_mem: 120G slurm_ntasks: 4 - - label: "🕺 gpu distributed hydrostatic model tests" + - label: "🦏 gpu distributed hydrostatic model tests" key: "distributed_hydrostatic_model_gpu" env: TEST_GROUP: "distributed_hydrostatic_model" @@ -97,7 +97,7 @@ steps: slurm_ntasks: 4 slurm_gpus_per_task: 1 - - label: "🤺 cpu distributed nonhydrostatic regression" + - label: "🦍 cpu distributed nonhydrostatic regression" key: "distributed_nonhydrostatic_regression_cpu" env: TEST_GROUP: "distributed_nonhydrostatic_regression" diff --git a/src/DistributedComputations/DistributedComputations.jl b/src/DistributedComputations/DistributedComputations.jl index 0f5f28007c..f3c1880d61 100644 --- a/src/DistributedComputations/DistributedComputations.jl +++ b/src/DistributedComputations/DistributedComputations.jl @@ -23,5 +23,6 @@ include("transposable_field.jl") include("distributed_transpose.jl") include("plan_distributed_transforms.jl") include("distributed_fft_based_poisson_solver.jl") +include("distributed_fft_tridiagonal_solver.jl") end # module diff --git a/src/DistributedComputations/distributed_architectures.jl b/src/DistributedComputations/distributed_architectures.jl index b1647747be..b69c58da19 100644 --- a/src/DistributedComputations/distributed_architectures.jl +++ b/src/DistributedComputations/distributed_architectures.jl @@ -24,7 +24,7 @@ end Return `Partition` representing the division of a domain in the `x` (first), `y` (second) and `z` (third) dimension -Keyword arguments: +Keyword arguments: ================== - `x`: partitioning of the first dimension diff --git a/src/DistributedComputations/distributed_fft_based_poisson_solver.jl b/src/DistributedComputations/distributed_fft_based_poisson_solver.jl index 7c259b9e3d..c73167e10d 100644 --- a/src/DistributedComputations/distributed_fft_based_poisson_solver.jl +++ b/src/DistributedComputations/distributed_fft_based_poisson_solver.jl @@ -1,7 +1,7 @@ import FFTW using CUDA: @allowscalar -using Oceananigans.Grids: XYZRegularRG +using Oceananigans.Grids: XYZRegularRG, XYRegularRG, XZRegularRG, YZRegularRG import Oceananigans.Solvers: poisson_eigenvalues, solve! import Oceananigans.Architectures: architecture @@ -60,9 +60,9 @@ In the algorithm below, the first dimension is always the local dimension. In ou 1. `storage.zfield`, partitioned over ``(x, y)`` is initialized with the `rhs` that is ``b``. 2. Transform along ``z``. -3 Transpose + communicate to `storage.yfield` partitioned into `(Rx, Ry)` processes in ``(x, z)``. +3 Transpose `storage.zfield` + communicate to `storage.yfield` partitioned into `(Rx, Ry)` processes in ``(x, z)``. 4. Transform along ``y``. -5. Transpose + communicate to `storage.xfield` partitioned into `(Rx, Ry)` processes in ``(y, z)``. +5. Transpose `storage.yfield` + communicate to `storage.xfield` partitioned into `(Rx, Ry)` processes in ``(y, z)``. 6. Transform in ``x``. 
At this point the three in-place forward transforms are complete, and we @@ -91,8 +91,8 @@ Restrictions """ function DistributedFFTBasedPoissonSolver(global_grid, local_grid, planner_flag=FFTW.PATIENT) - validate_global_grid(global_grid) - validate_configuration(global_grid, local_grid) + validate_poisson_solver_distributed_grid(global_grid) + validate_poisson_solver_configuration(global_grid, local_grid) FT = Complex{eltype(local_grid)} @@ -188,38 +188,41 @@ end end # TODO: bring up to speed the PCG to remove this error -validate_global_grid(global_grid) = +validate_poisson_solver_distributed_grid(global_grid) = throw("Grids other than the RectilinearGrid are not supported in the Distributed NonhydrostaticModels") -function validate_global_grid(global_grid::RectilinearGrid) +function validate_poisson_solver_distributed_grid(global_grid::RectilinearGrid) TX, TY, TZ = topology(global_grid) if (TY == Bounded && TZ == Periodic) || (TX == Bounded && TY == Periodic) || (TX == Bounded && TZ == Periodic) - throw("NonhydrostaticModels on Distributed grids do not support topology ($TX, $TY, $TZ) at the moment. - TZ Periodic requires also TY and TX to be Periodic, while TY Periodic requires also TX to be Periodic. - Please rotate the domain to obtain the required topology") + throw("Distributed Poisson solvers do not support grids with topology ($TX, $TY, $TZ) at the moment. + A Periodic z-direction also requires the y- and x-directions to be Periodic, while a Periodic y-direction also requires + the x-direction to be Periodic.") end - if !(global_grid isa XYZRegularRG) - throw("Stretched directions are not supported with distributed grids at the moment.") + if !(global_grid isa YZRegularRG) && !(global_grid isa XYRegularRG) && !(global_grid isa XZRegularRG) + throw("The provided grid is stretched in directions $(stretched_dimensions(global_grid)). + A distributed Poisson solver supports only RectilinearGrids that have variably-spaced cells in at most one direction.") end return nothing end -function validate_configuration(global_grid, local_grid) +function validate_poisson_solver_configuration(global_grid, local_grid) # We don't support distributing anything in z. - Rz = architecture(local_grid).ranks[3] - Rz == 1 || throw("Non-singleton ranks in the vertical are not supported by DistributedFFTBasedPoissonSolver.") + Rx, Ry, Rz = architecture(local_grid).ranks + Rz == 1 || throw("Non-singleton ranks in the vertical are not supported by distributed Poisson solvers.") # Limitation of the current implementation (see the docstring) - if global_grid.Nz % architecture(local_grid).ranks[2] != 0 - throw("The number of ranks in the y direction must divide Nz. See the docstring for more information.") + if global_grid.Nz % Ry != 0 - throw("The number of ranks in the y direction must divide Nz. See the docstring for more information.")
+ The distributed Poisson solver requires that the number of ranks in the x-direction divide Ny.") end return nothing diff --git a/src/DistributedComputations/distributed_fft_tridiagonal_solver.jl b/src/DistributedComputations/distributed_fft_tridiagonal_solver.jl new file mode 100644 index 0000000000..8911cb98d8 --- /dev/null +++ b/src/DistributedComputations/distributed_fft_tridiagonal_solver.jl @@ -0,0 +1,352 @@ +using CUDA: @allowscalar +using Oceananigans.Grids: stretched_dimensions +using Oceananigans.Grids: XDirection, YDirection +using Oceananigans.Operators: Δxᶠᵃᵃ, Δyᵃᶠᵃ, Δzᵃᵃᶠ + +using Oceananigans.Solvers: BatchedTridiagonalSolver, + stretched_direction, + ZTridiagonalSolver, + YTridiagonalSolver, + XTridiagonalSolver, + compute_main_diagonal! + +struct DistributedFourierTridiagonalPoissonSolver{G, L, B, P, R, S, β} + plan :: P + global_grid :: G + local_grid :: L + batched_tridiagonal_solver :: B + source_term :: R + storage :: S + buffer :: β +end + +# Useful aliases for dispatch... +const XStretchedDistributedSolver = DistributedFourierTridiagonalPoissonSolver{<:Any, <:Any, <:XTridiagonalSolver} +const YStretchedDistributedSolver = DistributedFourierTridiagonalPoissonSolver{<:Any, <:Any, <:YTridiagonalSolver} +const ZStretchedDistributedSolver = DistributedFourierTridiagonalPoissonSolver{<:Any, <:Any, <:ZTridiagonalSolver} + +architecture(solver::DistributedFourierTridiagonalPoissonSolver) = + architecture(solver.global_grid) + +@inline Δξᶠ(i, grid, ::Val{1}) = Δxᶠᵃᵃ(i, 1, 1, grid) +@inline Δξᶠ(j, grid, ::Val{2}) = Δyᵃᶠᵃ(1, j, 1, grid) +@inline Δξᶠ(k, grid, ::Val{3}) = Δzᵃᵃᶠ(1, 1, k, grid) + +""" + DistributedFourierTridiagonalPoissonSolver(global_grid, local_grid) + +Return an FFT-based solver for the Poisson equation evaluated on a `local_grid` that has a non-uniform +spacing in exactly one direction (i.e. either in x, y or z) + +```math +∇²φ = b +``` + +for `Distributed` architectures. + +Supported configurations +======================== + +In the following, `Nx`, `Ny`, and `Nz` are the number of grid points of the **global** grid +in the `x`, `y`, and `z` directions, while `Rx`, `Ry`, and `Rz` are the number of ranks in the +`x`, `y`, and `z` directions, respectively. Furthermore, 'pencil' decomposition refers to a domain +decomposed in two different directions (i.e., with `Rx != 1` and `Ry != 1`), while 'slab' decomposition +refers to a domain decomposed only in one direction (i.e., with either `Rx == 1` or `Ry == 1`). +Additionally, `storage` indicates the `TransposableField` used for storing intermediate results; +see [`TransposableField`](@ref). + +1. Three dimensional configurations with 'pencil' decompositions in ``(x, y)`` +where `Ny ≥ Rx` and `Ny % Rx = 0`, and `Nz ≥ Ry` and `Nz % Ry = 0`. + +2. Two dimensional configurations decomposed in ``x`` where `Ny ≥ Rx` and `Ny % Rx = 0`. + +!!! warning "Unsupported decompositions" + _Any_ configuration decomposed in the ``z`` direction is _not_ supported. + Furthermore, any ``(x, y)`` decompositions other than the configurations mentioned above are also _not_ supported.
+ +Algorithm for pencil decompositions +=================================== + +For pencil decompositions (useful for three-dimensional problems), +there are two forward transforms, two backward transforms, one tri-diagonal matrix inversion +and a variable number of transpositions that require MPI communication, depending on the +stretched direction: + +- a stretching in the x-direction requires four transpositions +- a stretching in the y-direction requires six transpositions +- a stretching in the z-direction requires eight transpositions + +!!! note "Computational cost" + Because of the additional transpositions, a stretching in the x-direction + is computationally cheaper than a stretching in the y-direction, and the latter + is cheaper than a stretching in the z-direction. + +In our implementation we require `Nz ≥ Ry` and `Ny ≥ Rx` with the additional constraint +that `Nz % Ry = 0` and `Ny % Rx = 0`. + +x - stretched algorithm +======================== + +1. `storage.zfield`, partitioned over ``(x, y)`` is initialized with the `rhs`. +2. Transform along ``z``. +3. Transpose `storage.zfield` + communicate to `storage.yfield` partitioned into `(Rx, Ry)` processes in ``(x, z)``. +4. Transform along ``y``. +5. Transpose `storage.yfield` + communicate to `storage.xfield` partitioned into `(Rx, Ry)` processes in ``(y, z)``. +6. Solve the tri-diagonal linear system in the ``x`` direction. + +Steps 5 -> 1 are reversed to obtain the result in physical space stored in `storage.zfield` +partitioned over ``(x, y)``. + +y - stretched algorithm +======================== + +1. `storage.zfield`, partitioned over ``(x, y)`` is initialized with the `rhs`. +2. Transform along ``z``. +3. Transpose `storage.zfield` + communicate to `storage.yfield` partitioned into `(Rx, Ry)` processes in ``(x, z)``. +4. Transpose `storage.yfield` + communicate to `storage.xfield` partitioned into `(Rx, Ry)` processes in ``(y, z)``. +5. Transform along ``x``. +6. Transpose `storage.xfield` + communicate to `storage.yfield` partitioned into `(Rx, Ry)` processes in ``(x, z)``. +7. Solve the tri-diagonal linear system in the ``y`` direction. + +Steps 6 -> 1 are reversed to obtain the result in physical space stored in `storage.zfield` +partitioned over ``(x, y)``. + +z - stretched algorithm +======================== + +1. `storage.zfield`, partitioned over ``(x, y)`` is initialized with the `rhs`. +2. Transpose `storage.zfield` + communicate to `storage.yfield` partitioned into `(Rx, Ry)` processes in ``(x, z)``. +3. Transform along ``y``. +4. Transpose `storage.yfield` + communicate to `storage.xfield` partitioned into `(Rx, Ry)` processes in ``(y, z)``. +5. Transform along ``x``. +6. Transpose `storage.xfield` + communicate to `storage.yfield` partitioned into `(Rx, Ry)` processes in ``(x, z)``. +7. Transpose `storage.yfield` + communicate to `storage.zfield` partitioned into `(Rx, Ry)` processes in ``(x, y)``. +8. Solve the tri-diagonal linear system in the ``z`` direction. + +Steps 7 -> 1 are reversed to obtain the result in physical space stored in `storage.zfield` +partitioned over ``(x, y)``. + +Algorithm for slab decompositions +================================= + +The 'slab' decomposition works in the same manner while skipping the transposes that +are not required. For example, if the domain is decomposed in ``x``, steps 4 and 6 in the above algorithm +are skipped (as are the associated reversed steps in the backward transform). + +Restrictions +============ + +1.
Pencil decompositions: + - `Ny ≥ Rx` and `Ny % Rx = 0` + - `Nz ≥ Ry` and `Nz % Ry = 0` + - If the ``z`` direction is `Periodic`, also the ``y`` and the ``x`` directions must be `Periodic` + - If the ``y`` direction is `Periodic`, also the ``x`` direction must be `Periodic` + +2. Slab decomposition: + - Same as for two-dimensional decompositions with `Rx` (or `Ry`) equal to one + +""" +function DistributedFourierTridiagonalPoissonSolver(global_grid, local_grid, planner_flag=FFTW.PATIENT; tridiagonal_direction = nothing) + + validate_poisson_solver_distributed_grid(global_grid) + validate_poisson_solver_configuration(global_grid, local_grid) + + if isnothing(tridiagonal_direction) + tridiagonal_dim = stretched_dimensions(local_grid)[1] + tridiagonal_direction = stretched_direction(local_grid) + else + tridiagonal_dim = tridiagonal_direction == XDirection() ? 1 : + tridiagonal_direction == YDirection() ? 2 : 3 + end + + topology(global_grid, tridiagonal_dim) != Bounded && + error("`DistributedFourierTridiagonalPoissonSolver` requires that the stretched direction (dimension $tridiagonal_dim) is `Bounded`.") + + FT = Complex{eltype(local_grid)} + child_arch = child_architecture(local_grid) + storage = TransposableField(CenterField(local_grid), FT) + + topo = (TX, TY, TZ) = topology(global_grid) + λx = dropdims(poisson_eigenvalues(global_grid.Nx, global_grid.Lx, 1, TX()), dims=(2, 3)) + λy = dropdims(poisson_eigenvalues(global_grid.Ny, global_grid.Ly, 2, TY()), dims=(1, 3)) + λz = dropdims(poisson_eigenvalues(global_grid.Nz, global_grid.Lz, 3, TZ()), dims=(1, 2)) + + if tridiagonal_dim == 1 + arch = architecture(storage.xfield.grid) + grid = storage.xfield.grid + λ1 = partition_coordinate(λy, size(grid, 2), arch, 2) + λ2 = partition_coordinate(λz, size(grid, 3), arch, 3) + elseif tridiagonal_dim == 2 + arch = architecture(storage.yfield.grid) + grid = storage.yfield.grid + λ1 = partition_coordinate(λx, size(grid, 1), arch, 1) + λ2 = partition_coordinate(λz, size(grid, 3), arch, 3) + elseif tridiagonal_dim == 3 + arch = architecture(storage.zfield.grid) + grid = storage.zfield.grid + λ1 = partition_coordinate(λx, size(grid, 1), arch, 1) + λ2 = partition_coordinate(λy, size(grid, 2), arch, 2) + end + + λ1 = on_architecture(child_arch, λ1) + λ2 = on_architecture(child_arch, λ2) + + plan = plan_distributed_transforms(global_grid, storage, planner_flag) + + # Lower and upper diagonals are the same + lower_diagonal = @allowscalar [ 1 / Δξᶠ(q, grid, Val(tridiagonal_dim)) for q in 2:size(grid, tridiagonal_dim) ] + lower_diagonal = on_architecture(child_arch, lower_diagonal) + upper_diagonal = lower_diagonal + + # Compute diagonal coefficients for each grid point + diagonal = zeros(eltype(grid), size(grid)...) 
+ diagonal = on_architecture(arch, diagonal) + launch_config = if tridiagonal_dim == 1 + :yz + elseif tridiagonal_dim == 2 + :xz + elseif tridiagonal_dim == 3 + :xy + end + + launch!(arch, grid, launch_config, compute_main_diagonal!, diagonal, grid, λ1, λ2, tridiagonal_direction) + + # Set up batched tridiagonal solver + btsolver = BatchedTridiagonalSolver(grid; lower_diagonal, diagonal, upper_diagonal, tridiagonal_direction) + + # We need to permute indices to apply bounded transforms on the GPU (r2r of r2c with twiddling) + x_buffer_needed = child_arch isa GPU && TX == Bounded + z_buffer_needed = child_arch isa GPU && TZ == Bounded + + # We cannot really batch anything, so on GPUs we always have to permute indices in the y direction + y_buffer_needed = child_arch isa GPU + + buffer_x = x_buffer_needed ? on_architecture(child_arch, zeros(FT, size(storage.xfield)...)) : nothing + buffer_y = y_buffer_needed ? on_architecture(child_arch, zeros(FT, size(storage.yfield)...)) : nothing + buffer_z = z_buffer_needed ? on_architecture(child_arch, zeros(FT, size(storage.zfield)...)) : nothing + + buffer = if tridiagonal_dim == 1 + (; y = buffer_y, z = buffer_z) + elseif tridiagonal_dim == 2 + (; x = buffer_x, z = buffer_z) + elseif tridiagonal_dim == 3 + (; x = buffer_x, y = buffer_y) + end + + if tridiagonal_dim == 1 + forward = (y! = plan.forward.y!, z! = plan.forward.z!) + backward = (y! = plan.backward.y!, z! = plan.backward.z!) + elseif tridiagonal_dim == 2 + forward = (x! = plan.forward.x!, z! = plan.forward.z!) + backward = (x! = plan.backward.x!, z! = plan.backward.z!) + elseif tridiagonal_dim == 3 + forward = (x! = plan.forward.x!, y! = plan.forward.y!) + backward = (x! = plan.backward.x!, y! = plan.backward.y!) + end + + plan = (; forward, backward) + + # Storage space for right hand side of Poisson equation + T = complex(eltype(grid)) + source_term = zeros(T, size(grid)...) + source_term = on_architecture(arch, source_term) + + return DistributedFourierTridiagonalPoissonSolver(plan, global_grid, local_grid, btsolver, source_term, storage, buffer) +end + +# solve! requires that `b` in `A x = b` (the right hand side) +# is copied in the solver storage +# See: Models/NonhydrostaticModels/solve_for_pressure.jl +function solve!(x, solver::ZStretchedDistributedSolver) + arch = architecture(solver) + storage = solver.storage + buffer = solver.buffer + + transpose_z_to_y!(storage) # copy data from storage.zfield to storage.yfield + solver.plan.forward.y!(parent(storage.yfield), buffer.y) + transpose_y_to_x!(storage) # copy data from storage.yfield to storage.xfield + solver.plan.forward.x!(parent(storage.xfield), buffer.x) + transpose_x_to_y!(storage) # copy data from storage.xfield to storage.yfield + transpose_y_to_z!(storage) # copy data from storage.yfield to storage.zfield + + # copy results in the source term + parent(solver.source_term) .= parent(storage.zfield) + + # Perform the implicit vertical solve here on storage.zfield... + # Solve tridiagonal system of linear equations at every z-column. 
+ solve!(storage.zfield, solver.batched_tridiagonal_solver, solver.source_term) + + transpose_z_to_y!(storage) + transpose_y_to_x!(storage) # copy data from storage.yfield to storage.xfield + solver.plan.backward.x!(parent(storage.xfield), buffer.x) + transpose_x_to_y!(storage) # copy data from storage.xfield to storage.yfield + solver.plan.backward.y!(parent(storage.yfield), buffer.y) + transpose_y_to_z!(storage) # copy data from storage.yfield to storage.zfield + + # Copy the real component of storage.zfield to x. + launch!(arch, solver.local_grid, :xyz, + _copy_real_component!, x, parent(storage.zfield)) + + return x +end + +function solve!(x, solver::YStretchedDistributedSolver) + arch = architecture(solver) + storage = solver.storage + buffer = solver.buffer + + solver.plan.forward.z!(parent(storage.zfield), buffer.z) + transpose_z_to_y!(storage) # copy data from storage.zfield to storage.yfield + transpose_y_to_x!(storage) # copy data from storage.yfield to storage.xfield + solver.plan.forward.x!(parent(storage.xfield), buffer.x) + transpose_x_to_y!(storage) # copy data from storage.xfield to storage.yfield + + # copy the results into the source term + parent(solver.source_term) .= parent(storage.yfield) + + # Perform the implicit solve along y here on storage.yfield... + # Solve tridiagonal system of linear equations at every y-column. + solve!(storage.yfield, solver.batched_tridiagonal_solver, solver.source_term) + + transpose_y_to_x!(storage) # copy data from storage.yfield to storage.xfield + solver.plan.backward.x!(parent(storage.xfield), buffer.x) + transpose_x_to_y!(storage) # copy data from storage.xfield to storage.yfield + transpose_y_to_z!(storage) # copy data from storage.yfield to storage.zfield + solver.plan.backward.z!(parent(storage.zfield), buffer.z) + + # Copy the real component of storage.zfield to x. + launch!(arch, solver.local_grid, :xyz, + _copy_real_component!, x, parent(storage.zfield)) + + return x +end + +function solve!(x, solver::XStretchedDistributedSolver) + arch = architecture(solver) + storage = solver.storage + buffer = solver.buffer + + # Apply forward transforms to the rhs stored in storage.zfield. + solver.plan.forward.z!(parent(storage.zfield), buffer.z) + transpose_z_to_y!(storage) # copy data from storage.zfield to storage.yfield + solver.plan.forward.y!(parent(storage.yfield), buffer.y) + transpose_y_to_x!(storage) # copy data from storage.yfield to storage.xfield + + # copy the results into the source term + parent(solver.source_term) .= parent(storage.xfield) + + # Perform the implicit solve along x here on storage.xfield... + # Solve tridiagonal system of linear equations at every x-column. + solve!(storage.xfield, solver.batched_tridiagonal_solver, solver.source_term) + + transpose_x_to_y!(storage) # copy data from storage.xfield to storage.yfield + solver.plan.backward.y!(parent(storage.yfield), buffer.y) + transpose_y_to_z!(storage) # copy data from storage.yfield to storage.zfield + solver.plan.backward.z!(parent(storage.zfield), buffer.z) # last backwards transform is in z + + # Copy the real component of storage.zfield to x.
+ launch!(arch, solver.local_grid, :xyz, + _copy_real_component!, x, parent(storage.zfield)) + + return x +end \ No newline at end of file diff --git a/src/DistributedComputations/distributed_grids.jl b/src/DistributedComputations/distributed_grids.jl index 5f5dee5ef3..686f9c9541 100644 --- a/src/DistributedComputations/distributed_grids.jl +++ b/src/DistributedComputations/distributed_grids.jl @@ -84,7 +84,6 @@ function RectilinearGrid(arch::Distributed, xl = Rx == 1 ? x : partition_coordinate(x, nx, arch, 1) yl = Ry == 1 ? y : partition_coordinate(y, ny, arch, 2) zl = Rz == 1 ? z : partition_coordinate(z, nz, arch, 3) - Lx, xᶠᵃᵃ, xᶜᵃᵃ, Δxᶠᵃᵃ, Δxᶜᵃᵃ = generate_coordinate(FT, topology[1](), nx, Hx, xl, :x, child_architecture(arch)) Ly, yᵃᶠᵃ, yᵃᶜᵃ, Δyᵃᶠᵃ, Δyᵃᶜᵃ = generate_coordinate(FT, topology[2](), ny, Hy, yl, :y, child_architecture(arch)) Lz, zᵃᵃᶠ, zᵃᵃᶜ, Δzᵃᵃᶠ, Δzᵃᵃᶜ = generate_coordinate(FT, topology[3](), nz, Hz, zl, :z, child_architecture(arch)) @@ -190,9 +189,9 @@ function reconstruct_global_grid(grid::DistributedRectilinearGrid) z = cpu_face_constructor_z(grid) ## This will not work with 3D parallelizations!! - xG = Rx == 1 ? x : assemble_coordinate(x, nx, Rx, ri, rj, rk, arch.communicator) - yG = Ry == 1 ? y : assemble_coordinate(y, ny, Ry, rj, ri, rk, arch.communicator) - zG = Rz == 1 ? z : assemble_coordinate(z, nz, Rz, rk, ri, rj, arch.communicator) + xG = Rx == 1 ? x : assemble_coordinate(x, nx, arch, 1) + yG = Ry == 1 ? y : assemble_coordinate(y, ny, arch, 2) + zG = Rz == 1 ? z : assemble_coordinate(z, nz, arch, 3) child_arch = child_architecture(arch) @@ -233,9 +232,9 @@ function reconstruct_global_grid(grid::DistributedLatitudeLongitudeGrid) z = cpu_face_constructor_z(grid) ## This will not work with 3D parallelizations!! - λG = Rx == 1 ? λ : assemble_coordinate(λ, nλ, Rx, ri, rj, rk, arch.communicator) - φG = Ry == 1 ? φ : assemble_coordinate(φ, nφ, Ry, rj, ri, rk, arch.communicator) - zG = Rz == 1 ? z : assemble_coordinate(z, nz, Rz, rk, ri, rj, arch.communicator) + λG = Rx == 1 ? λ : assemble_coordinate(λ, nλ, arch, 1) + φG = Ry == 1 ? φ : assemble_coordinate(φ, nφ, arch, 2) + zG = Rz == 1 ? z : assemble_coordinate(z, nz, arch, 3) child_arch = child_architecture(arch) diff --git a/src/DistributedComputations/partition_assemble.jl b/src/DistributedComputations/partition_assemble.jl index 6502901f64..c56d0b2f22 100644 --- a/src/DistributedComputations/partition_assemble.jl +++ b/src/DistributedComputations/partition_assemble.jl @@ -43,11 +43,12 @@ function partition_coordinate(c::AbstractVector, n, arch, idx) r = arch.local_index[idx] start_idx = sum(nl[1:r-1]) + 1 # sum of all previous rank's dimension + 1 - end_idx = if r == arch.ranks[idx] - length(c) + end_idx = if r == ranks(arch)[idx] + length(c) else sum(nl[1:r]) + 1 end + return c[start_idx : end_idx] end @@ -74,32 +75,40 @@ a local number of elements `Nc`, number of ranks `Nr`, rank `r`, and `arch`itecture. Since we use a global reduction, only ranks at positions 1 in the other two directions `r1 == 1` and `r2 == 1` fill the 1D array. 
""" -function assemble_coordinate(c_local::AbstractVector, n, R, r, r1, r2, comm) - nl = concatenate_local_sizes(n, R, r) +function assemble_coordinate(c_local::AbstractVector, n, arch, idx) + nl = concatenate_local_sizes(n, arch, idx) + R = arch.ranks[idx] + r = arch.local_index[idx] + r2 = [arch.local_index[i] for i in filter(x -> x != idx, (1, 2, 3))] c_global = zeros(eltype(c_local), sum(nl)+1) - if r1 == 1 && r2 == 1 + if r2[1] == 1 && r2[2] == 1 c_global[1 + sum(nl[1:r-1]) : sum(nl[1:r])] .= c_local[1:end-1] - r == Nr && (c_global[end] = c_local[end]) + r == R && (c_global[end] = c_local[end]) end - MPI.Allreduce!(c_global, +, comm) + MPI.Allreduce!(c_global, +, arch.communicator) return c_global end # Simple case, just take the first and the last core -function assemble_coordinate(c::Tuple, n, R, r, r1, r2, comm) +function assemble_coordinate(c_local::Tuple, n, arch, idx) c_global = zeros(Float64, 2) - - if r == 1 && r1 == 1 && r2 == 1 - c_global[1] = c[1] - elseif r == R && r1 == 1 && r2 == 1 - c_global[2] = c[2] + + rank = arch.local_index + R = arch.ranks[idx] + r = rank[idx] + r2 = [rank[i] for i in filter(x -> x != idx, (1, 2, 3))] + + if rank[1] == 1 && rank[2] == 1 && rank[3] == 1 + c_global[1] = c_local[1] + elseif r == R && r2[1] == 1 && r2[1] == 1 + c_global[2] = c_local[2] end - MPI.Allreduce!(c_global, +, comm) + MPI.Allreduce!(c_global, +, arch.communicator) return tuple(c_global...) end diff --git a/src/DistributedComputations/transposable_field.jl b/src/DistributedComputations/transposable_field.jl index 183846b798..be630cc46a 100644 --- a/src/DistributedComputations/transposable_field.jl +++ b/src/DistributedComputations/transposable_field.jl @@ -138,9 +138,9 @@ function twin_grid(grid::DistributedGrid; local_direction = :y) y = cpu_face_constructor_y(grid) z = cpu_face_constructor_z(grid) - xG = R[1] == 1 ? x : assemble_coordinate(x, nx, R[1], ri, rj, rk, arch.communicator) - yG = R[2] == 1 ? y : assemble_coordinate(y, ny, R[2], rj, ri, rk, arch.communicator) - zG = R[3] == 1 ? z : assemble_coordinate(z, nz, R[3], rk, ri, rj, arch.communicator) + xG = R[1] == 1 ? x : assemble_coordinate(x, nx, arch, 1) + yG = R[2] == 1 ? y : assemble_coordinate(y, ny, arch, 2) + zG = R[3] == 1 ? z : assemble_coordinate(z, nz, arch, 3) child_arch = child_architecture(arch) diff --git a/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl b/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl index 5fccd29f82..57be90b7f0 100644 --- a/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl +++ b/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl @@ -10,7 +10,9 @@ using Oceananigans.Utils using Oceananigans.Grids using Oceananigans.Solvers -using Oceananigans.DistributedComputations: Distributed, DistributedFFTBasedPoissonSolver, reconstruct_global_grid +using Oceananigans.DistributedComputations +using Oceananigans.DistributedComputations: reconstruct_global_grid, Distributed +using Oceananigans.DistributedComputations: DistributedFFTBasedPoissonSolver, DistributedFourierTridiagonalPoissonSolver using Oceananigans.Grids: XYRegularRG, XZRegularRG, YZRegularRG, XYZRegularRG using Oceananigans.ImmersedBoundaries: ImmersedBoundaryGrid using Oceananigans.Utils: SumOfArrays @@ -19,11 +21,26 @@ import Oceananigans: fields, prognostic_fields import Oceananigans.Advection: cell_advection_timescale import Oceananigans.TimeSteppers: step_lagrangian_particles! 
-function PressureSolver(arch::Distributed, local_grid::XYZRegularRG) +function PressureSolver(::Distributed, local_grid::XYZRegularRG) global_grid = reconstruct_global_grid(local_grid) return DistributedFFTBasedPoissonSolver(global_grid, local_grid) end +function PressureSolver(::Distributed, local_grid::XYRegularRG) + global_grid = reconstruct_global_grid(local_grid) + return DistributedFourierTridiagonalPoissonSolver(global_grid, local_grid) +end + +function PressureSolver(::Distributed, local_grid::YZRegularRG) + global_grid = reconstruct_global_grid(local_grid) + return DistributedFourierTridiagonalPoissonSolver(global_grid, local_grid) +end + +function PressureSolver(::Distributed, local_grid::XZRegularRG) + global_grid = reconstruct_global_grid(local_grid) + return DistributedFourierTridiagonalPoissonSolver(global_grid, local_grid) +end + PressureSolver(arch, grid::XYZRegularRG) = FFTBasedPoissonSolver(grid) PressureSolver(arch, grid::XYRegularRG) = FourierTridiagonalPoissonSolver(grid) PressureSolver(arch, grid::XZRegularRG) = FourierTridiagonalPoissonSolver(grid) diff --git a/src/Models/NonhydrostaticModels/solve_for_pressure.jl b/src/Models/NonhydrostaticModels/solve_for_pressure.jl index 10154fc91e..5064a2c5d6 100644 --- a/src/Models/NonhydrostaticModels/solve_for_pressure.jl +++ b/src/Models/NonhydrostaticModels/solve_for_pressure.jl @@ -61,6 +61,22 @@ function solve_for_pressure!(pressure, solver::FFTBasedPoissonSolver, Δt, U★) return nothing end +function solve_for_pressure!(pressure, solver::DistributedFourierTridiagonalPoissonSolver, Δt, U★) + + # Calculate right hand side: + rhs = solver.storage.zfield + arch = architecture(solver) + grid = solver.local_grid + + launch!(arch, grid, :xyz, calculate_pressure_source_term_fourier_tridiagonal_solver!, + rhs, grid, Δt, U★, solver.batched_tridiagonal_solver.tridiagonal_direction) + + # Pressure Poisson rhs, scaled by the spacing in the stretched direction at ᶜᶜᶜ, is stored in solver.source_term: + solve!(pressure, solver) + + return nothing +end + function solve_for_pressure!(pressure, solver::FourierTridiagonalPoissonSolver, Δt, U★) # Calculate right hand side: diff --git a/src/Solvers/batched_tridiagonal_solver.jl b/src/Solvers/batched_tridiagonal_solver.jl index 6316fc15d2..2886847938 100644 --- a/src/Solvers/batched_tridiagonal_solver.jl +++ b/src/Solvers/batched_tridiagonal_solver.jl @@ -18,8 +18,12 @@ struct BatchedTridiagonalSolver{A, B, C, T, G, P, D} tridiagonal_direction :: D end -architecture(solver::BatchedTridiagonalSolver) = architecture(solver.grid) +# Some aliases... +const XTridiagonalSolver = BatchedTridiagonalSolver{A, B, C, T, G, P, <:XDirection} where {A, B, C, T, G, P} +const YTridiagonalSolver = BatchedTridiagonalSolver{A, B, C, T, G, P, <:YDirection} where {A, B, C, T, G, P} +const ZTridiagonalSolver = BatchedTridiagonalSolver{A, B, C, T, G, P, <:ZDirection} where {A, B, C, T, G, P} +architecture(solver::BatchedTridiagonalSolver) = architecture(solver.grid) """ BatchedTridiagonalSolver(grid; diff --git a/test/test_distributed_poisson_solvers.jl b/test/test_distributed_poisson_solvers.jl index c5613ae9a7..dca406d8c3 100644 --- a/test/test_distributed_poisson_solvers.jl +++ b/test/test_distributed_poisson_solvers.jl @@ -30,7 +30,7 @@ include("dependencies_for_poisson_solvers.jl") # to initialize MPI. 
-using Oceananigans.DistributedComputations: reconstruct_global_grid, DistributedGrid, Partition +using Oceananigans.DistributedComputations: reconstruct_global_grid, DistributedGrid, Partition, DistributedFourierTridiagonalPoissonSolver using Oceananigans.Models.NonhydrostaticModels: solve_for_pressure! function random_divergent_source_term(grid::DistributedGrid) @@ -88,6 +88,39 @@ function divergence_free_poisson_solution(grid_points, ranks, topo, child_arch) return Array(interior(∇²ϕ)) ≈ Array(R) end +function divergence_free_poisson_tridiagonal_solution(grid_points, ranks, stretched_direction, child_arch) + arch = Distributed(child_arch, partition=Partition(ranks...)) + + if stretched_direction == :x + x = collect(range(0, 2π, length = grid_points[1]+1)) + y = z = (0, 2π) + elseif stretched_direction == :y + y = collect(range(0, 2π, length = grid_points[2]+1)) + x = z = (0, 2π) + elseif stretched_direction == :z + z = collect(range(0, 2π, length = grid_points[3]+1)) + x = y = (0, 2π) + end + + local_grid = RectilinearGrid(arch; topology=(Bounded, Bounded, Bounded), size=grid_points, x, y, z) + + # The test will solve for ϕ, then compare R to ∇²ϕ. + ϕ = CenterField(local_grid) + ∇²ϕ = CenterField(local_grid) + R, U = random_divergent_source_term(local_grid) + + global_grid = reconstruct_global_grid(local_grid) + solver = DistributedFourierTridiagonalPoissonSolver(global_grid, local_grid) + + # Using Δt = 1. + solve_for_pressure!(ϕ, solver, 1, U) + + # "Recompute" ∇²ϕ + compute_∇²!(∇²ϕ, ϕ, arch, local_grid) + + return Array(interior(∇²ϕ)) ≈ Array(R) +end + @testset "Distributed FFT-based Poisson solver" begin child_arch = test_child_arch() @@ -97,16 +130,30 @@ end (Bounded, Bounded, Bounded)) @info " Testing 3D distributed FFT-based Poisson solver with topology $topology..." - @show @test divergence_free_poisson_solution((44, 44, 8), (4, 1, 1), topology, child_arch) - @show @test divergence_free_poisson_solution((16, 44, 8), (4, 1, 1), topology, child_arch) - @show @test divergence_free_poisson_solution((44, 44, 8), (1, 4, 1), topology, child_arch) - @show @test divergence_free_poisson_solution((44, 16, 8), (1, 4, 1), topology, child_arch) - @show @test divergence_free_poisson_solution((16, 44, 8), (1, 4, 1), topology, child_arch) - @show @test divergence_free_poisson_solution((22, 44, 8), (2, 2, 1), topology, child_arch) - @show @test divergence_free_poisson_solution((44, 22, 8), (2, 2, 1), topology, child_arch) + @test divergence_free_poisson_solution((44, 44, 8), (4, 1, 1), topology, child_arch) + @test divergence_free_poisson_solution((16, 44, 8), (4, 1, 1), topology, child_arch) + @test divergence_free_poisson_solution((44, 44, 8), (1, 4, 1), topology, child_arch) + @test divergence_free_poisson_solution((44, 16, 8), (1, 4, 1), topology, child_arch) + @test divergence_free_poisson_solution((16, 44, 8), (1, 4, 1), topology, child_arch) + @test divergence_free_poisson_solution((22, 44, 8), (2, 2, 1), topology, child_arch) + @test divergence_free_poisson_solution((44, 22, 8), (2, 2, 1), topology, child_arch) @info " Testing 2D distributed FFT-based Poisson solver with topology $topology..." 
- @show @test divergence_free_poisson_solution((44, 16, 1), (4, 1, 1), topology, child_arch) - @show @test divergence_free_poisson_solution((16, 44, 1), (4, 1, 1), topology, child_arch) + @test divergence_free_poisson_solution((44, 16, 1), (4, 1, 1), topology, child_arch) + @test divergence_free_poisson_solution((16, 44, 1), (4, 1, 1), topology, child_arch) + end + + for stretched_direction in (:x, :y, :z) + @info " Testing 3D distributed Fourier Tridiagonal Poisson solver stretched in $stretched_direction" + @test divergence_free_poisson_tridiagonal_solution((44, 44, 8), (1, 4, 1), stretched_direction, child_arch) + @test divergence_free_poisson_tridiagonal_solution((44, 4, 8), (1, 4, 1), stretched_direction, child_arch) + @test divergence_free_poisson_tridiagonal_solution((16, 44, 8), (1, 4, 1), stretched_direction, child_arch) + @test divergence_free_poisson_tridiagonal_solution((22, 8, 8), (2, 2, 1), stretched_direction, child_arch) + @test divergence_free_poisson_tridiagonal_solution(( 8, 22, 8), (2, 2, 1), stretched_direction, child_arch) + @test divergence_free_poisson_tridiagonal_solution((44, 44, 8), (1, 4, 1), stretched_direction, child_arch) + @test divergence_free_poisson_tridiagonal_solution((44, 4, 8), (1, 4, 1), stretched_direction, child_arch) + @test divergence_free_poisson_tridiagonal_solution((16, 44, 8), (1, 4, 1), stretched_direction, child_arch) + @test divergence_free_poisson_tridiagonal_solution((22, 8, 8), (2, 2, 1), stretched_direction, child_arch) + @test divergence_free_poisson_tridiagonal_solution(( 8, 22, 8), (2, 2, 1), stretched_direction, child_arch) end end diff --git a/test/test_nonhydrostatic_regression.jl b/test/test_nonhydrostatic_regression.jl index deb9c5d72c..2d50afdfe2 100644 --- a/test/test_nonhydrostatic_regression.jl +++ b/test/test_nonhydrostatic_regression.jl @@ -53,11 +53,9 @@ include("regression_tests/ocean_large_eddy_simulation_regression_test.jl") A = typeof(arch) for grid_type in [:regular, :vertically_unstretched] - if !(arch isa Distributed && grid_type == :vertically_unstretched) - @testset "Rayleigh–Bénard tracer [$A, $grid_type grid]]" begin - @info " Testing Rayleigh–Bénard tracer regression [$A, $grid_type grid]" - run_rayleigh_benard_regression_test(arch, grid_type) - end + @testset "Rayleigh–Bénard tracer [$A, $grid_type grid]]" begin + @info " Testing Rayleigh–Bénard tracer regression [$A, $grid_type grid]" + run_rayleigh_benard_regression_test(arch, grid_type) end if !(arch isa Distributed)
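For reference, a minimal usage sketch of the new solver, following the pattern of the test added in `test/test_distributed_poisson_solvers.jl`. It assumes a CPU child architecture, four MPI ranks partitioned in `y` (launched with, e.g., `mpiexec -n 4 julia script.jl`), and a uniformly spaced face vector standing in for a genuinely stretched `z` coordinate; none of these choices are prescribed by the changeset itself.

```julia
# Sketch only: mirrors the added test divergence_free_poisson_tridiagonal_solution.
using MPI
using Oceananigans
using Oceananigans.DistributedComputations: Distributed, Partition, reconstruct_global_grid,
                                            DistributedFourierTridiagonalPoissonSolver

MPI.Initialized() || MPI.Init()

# Slab decomposition over y: Rx = 1, Ry = 4, Rz = 1 (the vertical is never distributed).
arch = Distributed(CPU(), partition = Partition(1, 4, 1))

# Passing z as a vector of cell interfaces marks the z-direction as stretched, so the
# grid is an XYRegularRG and PressureSolver dispatches to the Fourier tridiagonal solver.
z_faces = collect(range(0, 2π, length = 8 + 1))

local_grid = RectilinearGrid(arch;
                             topology = (Bounded, Bounded, Bounded),
                             size = (44, 44, 8),   # Nz = 8 is divisible by Ry = 4
                             x = (0, 2π), y = (0, 2π), z = z_faces)

# Construct the solver directly, as the test does...
global_grid = reconstruct_global_grid(local_grid)
solver = DistributedFourierTridiagonalPoissonSolver(global_grid, local_grid)

# ...or let the model choose it via the new PressureSolver methods.
model = NonhydrostaticModel(; grid = local_grid)
@info "Pressure solver in use: $(typeof(model.pressure_solver))"
```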