Skip to content

Commit

Permalink
New packunpack transpose
Browse files Browse the repository at this point in the history
  • Loading branch information
davecats committed Jul 11, 2023
1 parent dcc9c3b commit efd91f1
Show file tree
Hide file tree
Showing 13 changed files with 220 additions and 444 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ export CNFGSTRNG
config_file = compiler.settings
-include ${config_file}

OBJ = typedef.o rbmat.o mpi_transpose.o ffts.o dnsdata.o runtime.o
OBJ = typedef.o rbmat.o mpi_transpose.o ffts.o dnsdata.o
flags = -I$(FFTW_INC) -L$(FFTW_LIB) $(FLAGS)
libs = -lfftw3

Expand Down
17 changes: 11 additions & 6 deletions dnsdata.f90
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ MODULE dnsdata

!Simulation parameters
real(C_DOUBLE) :: PI=3.1415926535897932384626433832795028841971
integer(C_INT) :: nx,ny,nz,nxd,nzd
integer(C_INT) :: ny,nz,nxd
real(C_DOUBLE) :: alfa0,beta0,ni,a,ymin,ymax,deltat,cflmax,time,time0=0,dt_field,dt_save,t_max,gamma
real(C_DOUBLE) :: u0,uN
logical :: CPI
Expand Down Expand Up @@ -509,7 +509,8 @@ SUBROUTINE convolutions(iy,i,compute_cfl,in_timeloop,ODE)
DO iV=1,3
CALL IFT(VVdz(1:nzd,1:nxB,iV,i))
#ifdef nonblockingXZ
CALL MPI_IAlltoall(VVdz(:,:,iV,i), 1, Mdz, VVdx(:,:,iV,i), 1, Mdx, MPI_COMM_X, Rs(iV))
CALL zTOx(VVdz(:,:,iV,i),VVdx(:,:,iV,i),Rs)
!MPI_IAlltoall(VVdz(:,:,iV,i), 1, Mdz, VVdx(:,:,iV,i), 1, Mdx, MPI_COMM_X, Rs(iV))
#endif
#ifdef convvel
IF (compute_convvel) THEN
Expand All @@ -529,7 +530,8 @@ SUBROUTINE convolutions(iy,i,compute_cfl,in_timeloop,ODE)
END IF
#endif
#ifndef nonblockingXZ
CALL MPI_Alltoall(VVdz(:,:,iV,i), 1, Mdz, VVdx(:,:,iV,i), 1, Mdx, MPI_COMM_X)
CALL zTOx(VVdz(:,:,iV,i),VVdx(:,:,iV,i))
!CALL MPI_Alltoall(VVdz(:,:,iV,i), 1, Mdz, VVdx(:,:,iV,i), 1, Mdx, MPI_COMM_X)
VVdx(nx+2:nxd+1,1:nzB,iV,i)=0; CALL RFT(VVdx(1:nxd+1,1:nzB,iV,i),rVVdx(1:2*nxd+2,1:nzB,iV,i))
#endif
END DO
Expand Down Expand Up @@ -568,7 +570,8 @@ SUBROUTINE convolutions(iy,i,compute_cfl,in_timeloop,ODE)
END DO
DO iV=1,3
CALL HFT(rFdx(1:2*nxd+2,1:nzB,iV,1), Fdx(1:nxd+1,1:nzB,iV,1))
CALL MPI_Alltoall(Fdx(:,:,iV,1), 1, Mdx, Fdz(:,:,iV,1), 1, Mdz, MPI_COMM_X)
CALL xTOz(Fdx(:,:,iV,1),Fdz(:,:,iV,1))
!CALL MPI_Alltoall(Fdx(:,:,iV,1), 1, Mdx, Fdz(:,:,iV,1), 1, Mdz, MPI_COMM_X)
CALL FFT(Fdz(1:nzd,1:nxB,iV,1))
END DO
F(iy,0:nz,nx0:nxN,1:3)=Fdz(1:nz+1,1:nxB,1:3,1)*factor
Expand All @@ -582,10 +585,12 @@ SUBROUTINE convolutions(iy,i,compute_cfl,in_timeloop,ODE)
DO iV=1,6
CALL HFT(rVVdx(1:2*nxd+2,1:nzB,iV,i),VVdx(1:nxd+1,1:nzB,iV,i));
#ifndef nonblockingXZ
CALL MPI_Alltoall(VVdx(:,:,iV,i), 1, Mdx, VVdz(:,:,iV,i), 1, Mdz, MPI_COMM_X)
CALL xTOz(VVdx(:,:,iV,i), VVdz(:,:,iV,i))
!CALL MPI_Alltoall(VVdx(:,:,iV,i), 1, Mdx, VVdz(:,:,iV,i), 1, Mdz, MPI_COMM_X)
CALL FFT(VVdz(1:nzd,1:nxB,iV,i));
#else
CALL MPI_IAlltoall(VVdx(:,:,iV,i), 1, Mdx, VVdz(:,:,iV,i), 1, Mdz, MPI_COMM_X, Rs(iV))
CALL xTOz(VVdx(:,:,iV,i), VVdz(:,:,iV,i), Rs(iV))
!CALL MPI_IAlltoall(VVdx(:,:,iV,i), 1, Mdx, VVdz(:,:,iV,i), 1, Mdz, MPI_COMM_X, Rs(iV))
#endif
END DO
#ifdef nonblockingXZ
Expand Down
42 changes: 27 additions & 15 deletions header.h
Original file line number Diff line number Diff line change
@@ -1,49 +1,61 @@
! ==============================================
! C preprocessor defines
! ==============================================
! Which mpi transpose do you want?
!#define packunpack

! Stop program when warnings or interactive prompts
! are met; suggested for execution on cluster.
#define warnings_are_fatal ! <-----------------------------------------------------------------------------------------
!#define warnings_are_fatal ! <-----------------------------------------------------------------------------------------

! Use nonblocking communication in XZ directions
#define nonblockingXZ
!#define nonblockingXZ

! Use nonblocking communication in Y direction
! only if the program is NOT uiuj_*
! (uiuj does not support nonblocking comm in Y)
#ifndef forceblockingY
#define nonblockingY
#endif
! Force (nxd,nzd) to be at most the product of a
!#ifndef forceblockingY
!#define nonblockingY
!#endif

! Force (nxd,nzd) to be at most the product of a
! power of 2 and a single factor 3
#define useFFTfit

! half or full channel
!#define halfchannel

! Add a bodyforce
!#define bodyforce
#define BODYFORCE_HEADER "body_forces/am_f1/am_pardec.inc"
#define BODYFORCE_MODULES "body_forces/am_f1/am_f1.inc"
!#define BODYFORCE_HEADER "body_forces/am_f1/am_pardec.inc"
!#define BODYFORCE_MODULES "body_forces/am_f1/am_f1.inc"

! define a bodyforce in space (ibm)
!#define ibm

! Measure per timestep execution time
#define chron

! Distrubute processes among processor so that
! the y-dierction is contiguous
#define ycontiguous

! Compute convection velocity
!#define convvel

! Verbose echo of parallel parameters
#define mpiverbose

! Disable code optimisation where possible (FFTW),
! in order to get code whose behaviour can be replicated
! exactly with different parallelisations. Useful for
! testing.
!#define no_optimising_code
! Files and settings for runtime calculation and disk dump of statistics
#define runtime_avoid_savefld
#define HEADER_RUNTIME "runtime_plugin/instabudget/header.inc"
#define RUNTIME_SETUP_SUBROUTINE "runtime_plugin/instabudget/setup.inc"
#define RUNTIME_FINALISE_SUBROUTINE "runtime_plugin/instabudget/finalise.inc"
#define RUNTIME_SAVE_SUBROUTINE "runtime_plugin/instabudget/save.inc"
#define RUNTIME_AUXILIARY_SUBROUTINES "runtime_plugin/instabudget/aux.inc"

! This avoids to save Dati.cart.out every dt_field during
! the simulation. The file is saved only at the end of
! runtime
!#define runtime_avoid_savefld


! ==============================================
Expand Down
177 changes: 164 additions & 13 deletions mpi_transpose.f90
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,21 @@
MODULE mpi_transpose

USE, intrinsic :: iso_c_binding
USE, intrinsic :: iso_fortran_env
USE typedef
USE mpi_f08

TYPE(MPI_Comm) :: MPI_COMM_X, MPI_COMM_Y, MPI_COMM_NEXT, MPI_COMM_PREV
integer(C_INT),save :: nproc,iproc,ierr,npx,ipx,npy,ipy
integer(C_INT), save :: nx0,nxN,nxB,nz0,nzN,nzB,block
TYPE(MPI_Comm) :: MPI_CART_COMM, MPI_COMM_X, MPI_COMM_Y
integer(C_INT), save :: nproc,iproc,ierr,npx,ipx,npy,ipy,nzd,nx
integer(C_INT), save :: nx0,nxN,nxB,nz0,nzN,nzB,block,nxpp
integer(C_INT), save :: ny0,nyN,miny,maxy
integer(C_INT), save :: TAG_LUDECOMP=100, TAG_LUDIVSTEP1=101, TAG_LUDIVSTEP2=102, TAG_DUDY=103
complex(C_DOUBLE_COMPLEX), allocatable :: Ain(:),Aout(:)
logical, save :: first,last,has_terminal,has_average
TYPE(MPI_Datatype), save :: Mdz,Mdx,cmpl,writeview_type,owned2write_type,vel_read_type,vel_field_type
TYPE(MPI_Datatype), save :: writeview_type,owned2write_type,vel_read_type,vel_field_type
#ifndef packunpack
TYPE(MPI_Datatype), save :: Mdz,Mdx,cmpl
#endif
#ifdef nonblockingY
! Array of requests for nonblocking communication in linsolve
TYPE(MPI_REQUEST), allocatable :: REQlinSolve(:), REQvetaTOuvw(:)
Expand All @@ -38,14 +43,147 @@ MODULE mpi_transpose

CONTAINS

! First we get the pack/unpack version of the transpose
! in blocking and nonblocking version
#ifdef packunpack
#ifndef nonblockingXZ
!-------------- Transpose: Z to X --------------!
!-----------------------------------------------!
SUBROUTINE zTOx(Vz, Vx)
complex(C_DOUBLE_COMPLEX), intent(in) :: Vz(1:,1:)
complex(C_DOUBLE_COMPLEX), intent(out) :: Vx(1:,1:)
integer(C_SIZE_T) :: i,j
integer(C_SIZE_T) :: ix,iz,one,num,zero,npxt
integer(C_INT) :: ierr

one = 1
num = nzd/npx
zero = 0
npxt = npx

Ain=0; Aout=0

!Pack
i=zero
DO j=zero,npxt-one
DO ix=nx0,nxN
Ain(i:i+num-one)=Vz(one+j*num:(j+one)*num,ix-nx0+one)
i=i+num
END DO
END DO

!Send
CALL MPI_Alltoall(Ain, nxB*nzB, MPI_DOUBLE_COMPLEX, Aout, nxB*nzB, MPI_DOUBLE_COMPLEX, MPI_COMM_X)

!num = (nx+1)/npx
!Unpack
i=zero
DO ix=0,nx
Vx(ix+one,one:num)=Aout(i:i+num-one)
i=i+num
END DO
END SUBROUTINE zTOx


!-------------- Transpose: X to Z --------------!
!-----------------------------------------------!
SUBROUTINE xTOz(Vx,Vz)
complex(C_DOUBLE_COMPLEX), intent(out) :: Vz(1:,1:)
complex(C_DOUBLE_COMPLEX), intent(in) :: Vx(1:,1:)
integer(C_SIZE_T) :: i,j
integer(C_SIZE_T) :: ix,iz,one,num,zero,npxt

one = 1
num = (nx+1)/npx
zero = 0
npxt = npx

!Pack
i=zero
DO j=zero,npxt-one
DO iz=nz0,nzN
Ain(i:i+num-one)=Vx(one+j*num:(j+one)*num,iz-nz0+one)
i=i+num
END DO
END DO

!Send
CALL MPI_Alltoall(Ain, nxB*nzB, MPI_DOUBLE_COMPLEX, Aout, nxB*nzB, MPI_DOUBLE_COMPLEX, MPI_COMM_X)

!Unpack
i=zero
DO iz=0,nzd-1
Vz(iz+one,one:num)=Aout(i:i+num-one)
i=i+num
END DO
END SUBROUTINE xTOz
#else
#error "No nonblockingXZ version of the pack/unpack transform yet!"
#endif

#else
! Then we get the datatype version of the transpose
! in blocking and nonblocking version
#ifndef nonblockingXZ
!-------------- Transpose: Z to X --------------!
!-----------------------------------------------!
SUBROUTINE zTOx(Vz, Vx)
complex(C_DOUBLE_COMPLEX), intent(in) :: Vz(1:,1:)
complex(C_DOUBLE_COMPLEX), intent(out) :: Vx(1:,1:)

CALL MPI_Alltoall(Vz(:,:), 1, Mdz, Vx(:,:), 1, Mdx, MPI_COMM_X)

END SUBROUTINE zTOx


!-------------- Transpose: X to Z --------------!
!-----------------------------------------------!
SUBROUTINE xTOz(Vx,Vz)
complex(C_DOUBLE_COMPLEX), intent(out) :: Vz(1:,1:)
complex(C_DOUBLE_COMPLEX), intent(in) :: Vx(1:,1:)

CALL MPI_Alltoall(Vx(:,:), 1, Mdx, Vz(:,:), 1, Mdz, MPI_COMM_X)

END SUBROUTINE xTOz
#else
!-------------- Transpose: Z to X --------------!
!-----------------------------------------------!
SUBROUTINE zTOx(Vz, Vx)
complex(C_DOUBLE_COMPLEX), intent(in) :: Vz(1:,1:)
complex(C_DOUBLE_COMPLEX), intent(out) :: Vx(1:,1:)
type(MPI_REQUEST), intent(inout) :: Rs

CALL MPI_IAlltoall(Vz(:,:), 1, Mdz, Vx(:,:), 1, Mdx, MPI_COMM_X, Rs)

END SUBROUTINE zTOx


!-------------- Transpose: X to Z --------------!
!-----------------------------------------------!
SUBROUTINE xTOz(Vx,Vz)
complex(C_DOUBLE_COMPLEX), intent(out) :: Vz(1:,1:)
complex(C_DOUBLE_COMPLEX), intent(in) :: Vx(1:,1:)
type(MPI_REQUEST), intent(inout) :: Rs

CALL MPI_IAlltoall(Vx(:,:), 1, Mdx, Vz(:,:), 1, Mdz, MPI_COMM_X, Rs)

END SUBROUTINE xTOz
#endif
#endif


!------- Divide the problem in 1D slices -------!
!-----------------------------------------------!
SUBROUTINE init_MPI(nxpp,nz,ny,nxd,nzd)
integer(C_INT), intent(in) :: nxpp,nz,ny,nxd,nzd
TYPE(MPI_Datatype) :: row,column,tmp
integer(kind=MPI_ADDRESS_KIND) :: stride,lb
SUBROUTINE init_MPI(nx,nz,ny,nxd,nzd)
integer(C_INT), intent(in) :: nx,nz,ny,nxd,nzd
integer, parameter :: ndims = 4
integer :: array_of_sizes(ndims), array_of_subsizes(ndims), array_of_starts(ndims), ierror
integer(C_INT) :: i,j,dims(1:2),coords(1:2)
logical :: periods(1:2)=.false.,reorder=.true.
TYPE(MPI_Datatype), save :: row, column, tmp
integer(kind=MPI_ADDRESS_KIND) :: stride,lb
integer(C_INT), allocatable, dimension(:) :: rankW, rankX, rankY
nxpp=nx+1
! Define which process write on screen
has_terminal=(iproc==0)
#ifdef nonblockingY
Expand All @@ -56,11 +194,14 @@ SUBROUTINE init_MPI(nxpp,nz,ny,nxd,nzd)
#endif
! Processor decomposition
#ifdef ycontiguous
npx=nproc/npy; ipx=iproc/npy;
CALL MPI_Comm_split(MPI_COMM_WORLD, ipx, iproc, MPI_COMM_Y) ! Communicator with processes holding same X-slab
CALL MPI_Comm_rank(MPI_COMM_Y, ipy)
npx=nproc/npy;
dims(1)=npx; dims(2)=npy
CALL MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, reorder, MPI_CART_COMM)
CALL MPi_Cart_coords(MPI_CART_COMM, iproc, 2, coords)
ipx=coords(1); ipy=coords(2)
first=(ipy==0); last=(ipy==(npy-1))
CALL MPI_Comm_split(MPI_COMM_WORLD, ipy, iproc, MPI_COMM_X) ! Communicator with processes holding same Y-slab
CALL MPI_Comm_split(MPI_CART_COMM, ipx, iproc, MPI_COMM_Y) ! Communicator with processes holding same X-slab
CALL MPI_Comm_split(MPI_CART_COMM, ipy, iproc, MPI_COMM_X) ! Communicator with processes holding same Y-slab
#else
npx=nproc/npy; ipy=iproc/npx; first=(ipy==0); last=(ipy==(npy-1));
CALL MPI_Comm_split(MPI_COMM_WORLD, ipy, iproc, MPI_COMM_X) ! Communicator with processes holding same Y-slab
Expand All @@ -79,8 +220,17 @@ SUBROUTINE init_MPI(nxpp,nz,ny,nxd,nzd)
has_average=(nx0==0)
! Communicators only with previous and next in Y
#ifdef mpiverbose
WRITE(*,*) "iproc=",iproc,"ipx=",ipx,"ipy=",ipy, "nx0=",nx0," nxN=",nxN," nxB=",nxB, "nz0=",nz0," nzN=",nzN," nzB=",nzB, "ny0=", ny0, "nyN=", nyN
DO i=0,nproc-1
IF (iproc==i) WRITE(*,*) "iproc=",iproc,"ipx=",ipx,"ipy=",ipy, "nx0=",nx0," nxN=",nxN," nxB=",nxB, "nz0=",nz0," nzN=",nzN," nzB=",nzB, "ny0=", ny0, "nyN=", nyN
CALL MPI_Barrier(MPI_COMM_WORLD)
END DO
FLUSH(output_unit)
#endif
#ifdef packunpack
! Allocate buffers for transposes
ALLOCATE(Ain(0:block-1)); Ain=0
ALLOCATE(Aout(0:block-1)); Aout=0
#else
! MPI derived datatyped - basics
CALL MPI_Type_contiguous(2,MPI_DOUBLE_PRECISION,cmpl) !complex
CALL MPI_Type_commit(cmpl)
Expand All @@ -92,6 +242,7 @@ SUBROUTINE init_MPI(nxpp,nz,ny,nxd,nzd)
lb=0; stride=8*2; CALL MPI_Type_create_resized(column,lb,stride,tmp)
CALL MPI_Type_contiguous(nxB,tmp,Mdx)
CALL MPI_Type_commit(Mdx)
#endif
! For READING VELOCITY, SETTING VIEW: datatype that maps velocity on disk to memory (it differs from writing: halo cells are read twice!)
CALL MPI_Type_create_subarray(ndims, [ny+3, 2*nz+1, nxpp, 3], [nyN-ny0+5, 2*nz+1, nxB, 3], [ny0-1,0,nx0,0], MPI_ORDER_FORTRAN, MPI_DOUBLE_COMPLEX, vel_read_type, ierror)
CALL MPI_Type_commit(vel_read_type, ierror)
Expand Down
5 changes: 3 additions & 2 deletions postpro/am/camstar.f90
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,8 @@ subroutine scale_filter(iy1, iy2)
do icnt=1,2
do iV=1,4
call IFT(VVdz(:,:,iV,icnt)) ! first transform in z
call MPI_Alltoall(VVdz(:,:,iv,icnt), 1, Mdz, VVdx(:,:,iv,icnt), 1, Mdx, MPI_COMM_X)
call zTOx(VVdz(:,:,iv,icnt),VVdx(:,:,iv,icnt))
!call MPI_Alltoall(VVdz(:,:,iv,icnt), 1, Mdz, VVdx(:,:,iv,icnt), 1, Mdx, MPI_COMM_X)
VVdx(nx+2:nxd+1,1:nzB,iV,icnt)=0
call RFT(VVdx(:,:,iV,icnt),rVVdx(:,:,iV,icnt)) ! second transform in x
end do
Expand Down Expand Up @@ -596,4 +597,4 @@ subroutine watermark()



end program
end program
Loading

0 comments on commit efd91f1

Please sign in to comment.