New packunpack transpose

davecats · Jul 11, 2023 · efd91f1 · efd91f1
1 parent dcc9c3b
commit efd91f1
Show file tree

Hide file tree

Showing 13 changed files with 220 additions and 444 deletions.
diff --git a/Makefile b/Makefile
@@ -37,7 +37,7 @@ export CNFGSTRNG
 config_file = compiler.settings
 -include ${config_file}
 
-OBJ = typedef.o rbmat.o mpi_transpose.o ffts.o dnsdata.o runtime.o
+OBJ = typedef.o rbmat.o mpi_transpose.o ffts.o dnsdata.o
 flags = -I$(FFTW_INC) -L$(FFTW_LIB) $(FLAGS)
 libs = -lfftw3
 

diff --git a/dnsdata.f90 b/dnsdata.f90
@@ -24,7 +24,7 @@ MODULE dnsdata
 
   !Simulation parameters
   real(C_DOUBLE) :: PI=3.1415926535897932384626433832795028841971
-  integer(C_INT) :: nx,ny,nz,nxd,nzd
+  integer(C_INT) :: ny,nz,nxd
   real(C_DOUBLE) :: alfa0,beta0,ni,a,ymin,ymax,deltat,cflmax,time,time0=0,dt_field,dt_save,t_max,gamma
   real(C_DOUBLE) :: u0,uN
   logical :: CPI
@@ -509,7 +509,8 @@ SUBROUTINE convolutions(iy,i,compute_cfl,in_timeloop,ODE)
      DO iV=1,3
        CALL IFT(VVdz(1:nzd,1:nxB,iV,i))  
 #ifdef nonblockingXZ
-       CALL MPI_IAlltoall(VVdz(:,:,iV,i), 1, Mdz, VVdx(:,:,iV,i), 1, Mdx, MPI_COMM_X, Rs(iV))
+       CALL zTOx(VVdz(:,:,iV,i),VVdx(:,:,iV,i),Rs) 
+       !MPI_IAlltoall(VVdz(:,:,iV,i), 1, Mdz, VVdx(:,:,iV,i), 1, Mdx, MPI_COMM_X, Rs(iV))
 #endif
 #ifdef convvel
        IF (compute_convvel) THEN
@@ -529,7 +530,8 @@ SUBROUTINE convolutions(iy,i,compute_cfl,in_timeloop,ODE)
        END IF
 #endif
 #ifndef nonblockingXZ
-       CALL MPI_Alltoall(VVdz(:,:,iV,i), 1, Mdz, VVdx(:,:,iV,i), 1, Mdx, MPI_COMM_X)
+       CALL zTOx(VVdz(:,:,iV,i),VVdx(:,:,iV,i))
+       !CALL MPI_Alltoall(VVdz(:,:,iV,i), 1, Mdz, VVdx(:,:,iV,i), 1, Mdx, MPI_COMM_X)
        VVdx(nx+2:nxd+1,1:nzB,iV,i)=0;    CALL RFT(VVdx(1:nxd+1,1:nzB,iV,i),rVVdx(1:2*nxd+2,1:nzB,iV,i))
 #endif
      END DO
@@ -568,7 +570,8 @@ SUBROUTINE convolutions(iy,i,compute_cfl,in_timeloop,ODE)
        END DO
        DO iV=1,3
          CALL HFT(rFdx(1:2*nxd+2,1:nzB,iV,1), Fdx(1:nxd+1,1:nzB,iV,1))
-         CALL MPI_Alltoall(Fdx(:,:,iV,1), 1, Mdx, Fdz(:,:,iV,1), 1, Mdz, MPI_COMM_X)
+         CALL xTOz(Fdx(:,:,iV,1),Fdz(:,:,iV,1))
+         !CALL MPI_Alltoall(Fdx(:,:,iV,1), 1, Mdx, Fdz(:,:,iV,1), 1, Mdz, MPI_COMM_X)
          CALL FFT(Fdz(1:nzd,1:nxB,iV,1))
        END DO
        F(iy,0:nz,nx0:nxN,1:3)=Fdz(1:nz+1,1:nxB,1:3,1)*factor
@@ -582,10 +585,12 @@ SUBROUTINE convolutions(iy,i,compute_cfl,in_timeloop,ODE)
      DO iV=1,6
        CALL HFT(rVVdx(1:2*nxd+2,1:nzB,iV,i),VVdx(1:nxd+1,1:nzB,iV,i)); 
 #ifndef nonblockingXZ
-       CALL MPI_Alltoall(VVdx(:,:,iV,i), 1, Mdx, VVdz(:,:,iV,i), 1, Mdz, MPI_COMM_X)
+       CALL xTOz(VVdx(:,:,iV,i), VVdz(:,:,iV,i))
+       !CALL MPI_Alltoall(VVdx(:,:,iV,i), 1, Mdx, VVdz(:,:,iV,i), 1, Mdz, MPI_COMM_X)
        CALL FFT(VVdz(1:nzd,1:nxB,iV,i));
 #else 
-       CALL MPI_IAlltoall(VVdx(:,:,iV,i), 1, Mdx, VVdz(:,:,iV,i), 1, Mdz, MPI_COMM_X, Rs(iV))
+       CALL xTOz(VVdx(:,:,iV,i), VVdz(:,:,iV,i), Rs(iV))
+       !CALL MPI_IAlltoall(VVdx(:,:,iV,i), 1, Mdx, VVdz(:,:,iV,i), 1, Mdz, MPI_COMM_X, Rs(iV))
 #endif
      END DO
 #ifdef nonblockingXZ

diff --git a/header.h b/header.h
@@ -1,49 +1,61 @@
 ! ==============================================
 ! C preprocessor defines
 ! ==============================================
+! Which mpi transpose do you want?
+!#define packunpack
+
 ! Stop program when warnings or interactive prompts
 ! are met; suggested for execution on cluster.
-#define warnings_are_fatal !      <-----------------------------------------------------------------------------------------
+!#define warnings_are_fatal !      <-----------------------------------------------------------------------------------------
+
 ! Use nonblocking communication in XZ directions
-#define nonblockingXZ
+!#define nonblockingXZ
+
 ! Use nonblocking communication in Y direction
 ! only if the program is NOT uiuj_*
 ! (uiuj does not support nonblocking comm in Y)
-#ifndef forceblockingY
-#define nonblockingY
-#endif
-! Force (nxd,nzd) to be at most the product of a
+!#ifndef forceblockingY
+!#define nonblockingY
+!#endif
+
+	! Force (nxd,nzd) to be at most the product of a
 ! power of 2 and a single factor 3
 #define useFFTfit
+
 ! half or full channel
 !#define halfchannel
+
 ! Add a bodyforce 
 !#define bodyforce
-#define BODYFORCE_HEADER "body_forces/am_f1/am_pardec.inc"
-#define BODYFORCE_MODULES "body_forces/am_f1/am_f1.inc"
+!#define BODYFORCE_HEADER "body_forces/am_f1/am_pardec.inc"
+!#define BODYFORCE_MODULES "body_forces/am_f1/am_f1.inc"
+
 ! define a bodyforce in space (ibm)
 !#define ibm
+
 ! Measure per timestep execution time
 #define chron
+
 ! Distrubute processes among processor so that
 ! the y-dierction is contiguous 
 #define ycontiguous
+
 ! Compute convection velocity
 !#define convvel
+
 ! Verbose echo of parallel parameters
 #define mpiverbose
+
 ! Disable code optimisation where possible (FFTW),
 ! in order to get code whose behaviour can be replicated
 ! exactly with different parallelisations. Useful for 
 ! testing.
 !#define no_optimising_code
-! Files and settings for runtime calculation and disk dump of statistics
-#define runtime_avoid_savefld
-#define HEADER_RUNTIME "runtime_plugin/instabudget/header.inc"
-#define RUNTIME_SETUP_SUBROUTINE "runtime_plugin/instabudget/setup.inc"
-#define RUNTIME_FINALISE_SUBROUTINE "runtime_plugin/instabudget/finalise.inc"
-#define RUNTIME_SAVE_SUBROUTINE "runtime_plugin/instabudget/save.inc"
-#define RUNTIME_AUXILIARY_SUBROUTINES "runtime_plugin/instabudget/aux.inc"
+
+! This avoids to save Dati.cart.out every dt_field during
+! the simulation. The file is saved only at the end of
+! runtime
+!#define runtime_avoid_savefld
 
 
 ! ==============================================

diff --git a/mpi_transpose.f90 b/mpi_transpose.f90
@@ -16,16 +16,21 @@
 MODULE mpi_transpose
 
   USE, intrinsic :: iso_c_binding
+  USE, intrinsic :: iso_fortran_env
   USE typedef
   USE mpi_f08
 
-  TYPE(MPI_Comm) :: MPI_COMM_X, MPI_COMM_Y, MPI_COMM_NEXT, MPI_COMM_PREV
-  integer(C_INT),save :: nproc,iproc,ierr,npx,ipx,npy,ipy
-  integer(C_INT), save :: nx0,nxN,nxB,nz0,nzN,nzB,block
+  TYPE(MPI_Comm) :: MPI_CART_COMM, MPI_COMM_X, MPI_COMM_Y 
+  integer(C_INT), save :: nproc,iproc,ierr,npx,ipx,npy,ipy,nzd,nx
+  integer(C_INT), save :: nx0,nxN,nxB,nz0,nzN,nzB,block,nxpp
   integer(C_INT), save :: ny0,nyN,miny,maxy
   integer(C_INT), save :: TAG_LUDECOMP=100, TAG_LUDIVSTEP1=101, TAG_LUDIVSTEP2=102, TAG_DUDY=103
+  complex(C_DOUBLE_COMPLEX), allocatable :: Ain(:),Aout(:)
   logical, save :: first,last,has_terminal,has_average
-  TYPE(MPI_Datatype), save :: Mdz,Mdx,cmpl,writeview_type,owned2write_type,vel_read_type,vel_field_type
+  TYPE(MPI_Datatype), save :: writeview_type,owned2write_type,vel_read_type,vel_field_type
+#ifndef packunpack
+  TYPE(MPI_Datatype), save :: Mdz,Mdx,cmpl
+#endif
 #ifdef nonblockingY
   ! Array of requests for nonblocking communication in linsolve
   TYPE(MPI_REQUEST), allocatable :: REQlinSolve(:), REQvetaTOuvw(:)
@@ -38,14 +43,147 @@ MODULE mpi_transpose
 
 CONTAINS
 
+! First we get the pack/unpack version of the transpose
+! in blocking and nonblocking version
+#ifdef packunpack
+#ifndef nonblockingXZ
+  !-------------- Transpose: Z to X --------------!
+  !-----------------------------------------------!
+  SUBROUTINE zTOx(Vz, Vx)  
+    complex(C_DOUBLE_COMPLEX), intent(in)  :: Vz(1:,1:)
+    complex(C_DOUBLE_COMPLEX), intent(out) :: Vx(1:,1:)
+    integer(C_SIZE_T) :: i,j
+    integer(C_SIZE_T) :: ix,iz,one,num,zero,npxt
+    integer(C_INT) :: ierr
+
+    one = 1
+    num = nzd/npx
+    zero = 0
+    npxt = npx
+
+    Ain=0; Aout=0
+
+    !Pack
+    i=zero
+    DO j=zero,npxt-one
+      DO ix=nx0,nxN
+        Ain(i:i+num-one)=Vz(one+j*num:(j+one)*num,ix-nx0+one)
+        i=i+num
+      END DO
+    END DO
+
+    !Send
+    CALL MPI_Alltoall(Ain, nxB*nzB, MPI_DOUBLE_COMPLEX, Aout, nxB*nzB, MPI_DOUBLE_COMPLEX, MPI_COMM_X)
+
+    !num = (nx+1)/npx
+    !Unpack
+    i=zero
+    DO ix=0,nx
+      Vx(ix+one,one:num)=Aout(i:i+num-one)
+      i=i+num
+    END DO
+  END SUBROUTINE zTOx
+
+
+  !-------------- Transpose: X to Z --------------!
+  !-----------------------------------------------!
+  SUBROUTINE xTOz(Vx,Vz)
+    complex(C_DOUBLE_COMPLEX), intent(out) :: Vz(1:,1:)
+    complex(C_DOUBLE_COMPLEX), intent(in) :: Vx(1:,1:)
+    integer(C_SIZE_T) :: i,j
+    integer(C_SIZE_T) :: ix,iz,one,num,zero,npxt
+
+    one = 1
+    num = (nx+1)/npx
+    zero = 0
+    npxt = npx
+
+    !Pack
+    i=zero
+    DO j=zero,npxt-one
+      DO iz=nz0,nzN
+        Ain(i:i+num-one)=Vx(one+j*num:(j+one)*num,iz-nz0+one)
+        i=i+num
+      END DO
+    END DO
+
+    !Send
+    CALL MPI_Alltoall(Ain, nxB*nzB, MPI_DOUBLE_COMPLEX, Aout, nxB*nzB, MPI_DOUBLE_COMPLEX, MPI_COMM_X)
+
+    !Unpack
+    i=zero
+    DO iz=0,nzd-1
+      Vz(iz+one,one:num)=Aout(i:i+num-one)
+      i=i+num
+    END DO
+  END SUBROUTINE xTOz
+#else 
+#error "No nonblockingXZ version of the pack/unpack transform yet!"
+#endif
+
+#else
+! Then we get the datatype version of the transpose
+! in blocking and nonblocking version
+#ifndef nonblockingXZ
+  !-------------- Transpose: Z to X --------------!
+  !-----------------------------------------------!
+  SUBROUTINE zTOx(Vz, Vx)  
+    complex(C_DOUBLE_COMPLEX), intent(in)  :: Vz(1:,1:)
+    complex(C_DOUBLE_COMPLEX), intent(out) :: Vx(1:,1:)
+
+    CALL MPI_Alltoall(Vz(:,:), 1, Mdz, Vx(:,:), 1, Mdx, MPI_COMM_X)
+
+  END SUBROUTINE zTOx
+
+
+  !-------------- Transpose: X to Z --------------!
+  !-----------------------------------------------!
+  SUBROUTINE xTOz(Vx,Vz)
+    complex(C_DOUBLE_COMPLEX), intent(out) :: Vz(1:,1:)
+    complex(C_DOUBLE_COMPLEX), intent(in) :: Vx(1:,1:)
+
+    CALL MPI_Alltoall(Vx(:,:), 1, Mdx, Vz(:,:), 1, Mdz, MPI_COMM_X)
+
+  END SUBROUTINE xTOz
+#else
+  !-------------- Transpose: Z to X --------------!
+  !-----------------------------------------------!
+  SUBROUTINE zTOx(Vz, Vx)  
+    complex(C_DOUBLE_COMPLEX), intent(in)  :: Vz(1:,1:)
+    complex(C_DOUBLE_COMPLEX), intent(out) :: Vx(1:,1:)
+    type(MPI_REQUEST), intent(inout) :: Rs
+
+    CALL MPI_IAlltoall(Vz(:,:), 1, Mdz, Vx(:,:), 1, Mdx, MPI_COMM_X, Rs)
+
+  END SUBROUTINE zTOx
+
+
+  !-------------- Transpose: X to Z --------------!
+  !-----------------------------------------------!
+  SUBROUTINE xTOz(Vx,Vz)
+    complex(C_DOUBLE_COMPLEX), intent(out) :: Vz(1:,1:)
+    complex(C_DOUBLE_COMPLEX), intent(in) :: Vx(1:,1:)
+    type(MPI_REQUEST), intent(inout) :: Rs
+
+    CALL MPI_IAlltoall(Vx(:,:), 1, Mdx, Vz(:,:), 1, Mdz, MPI_COMM_X, Rs)
+
+  END SUBROUTINE xTOz
+#endif
+#endif
+
+
   !------- Divide the problem in 1D slices -------! 
   !-----------------------------------------------!
-  SUBROUTINE init_MPI(nxpp,nz,ny,nxd,nzd)
-    integer(C_INT), intent(in)  :: nxpp,nz,ny,nxd,nzd
-    TYPE(MPI_Datatype) :: row,column,tmp
-    integer(kind=MPI_ADDRESS_KIND) :: stride,lb
+  SUBROUTINE init_MPI(nx,nz,ny,nxd,nzd)
+    integer(C_INT), intent(in)  :: nx,nz,ny,nxd,nzd
     integer, parameter :: ndims = 4 
     integer :: array_of_sizes(ndims), array_of_subsizes(ndims), array_of_starts(ndims), ierror
+    integer(C_INT) :: i,j,dims(1:2),coords(1:2)
+    logical :: periods(1:2)=.false.,reorder=.true.
+    TYPE(MPI_Datatype), save :: row, column, tmp
+    integer(kind=MPI_ADDRESS_KIND) :: stride,lb
+    integer(C_INT), allocatable, dimension(:) :: rankW, rankX, rankY
+    nxpp=nx+1
     ! Define which process write on screen
     has_terminal=(iproc==0)
 #ifdef nonblockingY
@@ -56,11 +194,14 @@ SUBROUTINE init_MPI(nxpp,nz,ny,nxd,nzd)
 #endif
     ! Processor decomposition
 #ifdef ycontiguous
-    npx=nproc/npy; ipx=iproc/npy; 
-    CALL MPI_Comm_split(MPI_COMM_WORLD, ipx, iproc, MPI_COMM_Y) ! Communicator with processes holding same X-slab
-    CALL MPI_Comm_rank(MPI_COMM_Y, ipy)
+    npx=nproc/npy;  
+    dims(1)=npx; dims(2)=npy
+    CALL MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, reorder, MPI_CART_COMM)
+    CALL MPi_Cart_coords(MPI_CART_COMM, iproc, 2, coords)
+    ipx=coords(1); ipy=coords(2)
     first=(ipy==0); last=(ipy==(npy-1))
-    CALL MPI_Comm_split(MPI_COMM_WORLD, ipy, iproc, MPI_COMM_X) ! Communicator with processes holding same Y-slab
+    CALL MPI_Comm_split(MPI_CART_COMM, ipx, iproc, MPI_COMM_Y) ! Communicator with processes holding same X-slab
+    CALL MPI_Comm_split(MPI_CART_COMM, ipy, iproc, MPI_COMM_X) ! Communicator with processes holding same Y-slab
 #else
     npx=nproc/npy; ipy=iproc/npx; first=(ipy==0); last=(ipy==(npy-1));
     CALL MPI_Comm_split(MPI_COMM_WORLD, ipy, iproc, MPI_COMM_X) ! Communicator with processes holding same Y-slab
@@ -79,8 +220,17 @@ SUBROUTINE init_MPI(nxpp,nz,ny,nxd,nzd)
     has_average=(nx0==0)
     ! Communicators only with previous and next in Y
 #ifdef mpiverbose
-    WRITE(*,*) "iproc=",iproc,"ipx=",ipx,"ipy=",ipy, "nx0=",nx0," nxN=",nxN," nxB=",nxB, "nz0=",nz0," nzN=",nzN," nzB=",nzB, "ny0=", ny0, "nyN=", nyN 
+    DO i=0,nproc-1
+       IF (iproc==i) WRITE(*,*) "iproc=",iproc,"ipx=",ipx,"ipy=",ipy, "nx0=",nx0," nxN=",nxN," nxB=",nxB, "nz0=",nz0," nzN=",nzN," nzB=",nzB, "ny0=", ny0, "nyN=", nyN 
+       CALL MPI_Barrier(MPI_COMM_WORLD)
+    END DO
+    FLUSH(output_unit)
 #endif
+#ifdef packunpack
+    ! Allocate buffers for transposes
+    ALLOCATE(Ain(0:block-1)); Ain=0
+    ALLOCATE(Aout(0:block-1)); Aout=0
+#else
     ! MPI derived datatyped - basics
     CALL MPI_Type_contiguous(2,MPI_DOUBLE_PRECISION,cmpl)   !complex
     CALL MPI_Type_commit(cmpl)
@@ -92,6 +242,7 @@ SUBROUTINE init_MPI(nxpp,nz,ny,nxd,nzd)
     lb=0; stride=8*2;  CALL MPI_Type_create_resized(column,lb,stride,tmp)
     CALL MPI_Type_contiguous(nxB,tmp,Mdx)
     CALL MPI_Type_commit(Mdx)
+#endif
     ! For READING VELOCITY, SETTING VIEW: datatype that maps velocity on disk to memory (it differs from writing: halo cells are read twice!)
     CALL MPI_Type_create_subarray(ndims, [ny+3, 2*nz+1, nxpp, 3], [nyN-ny0+5, 2*nz+1, nxB, 3], [ny0-1,0,nx0,0], MPI_ORDER_FORTRAN, MPI_DOUBLE_COMPLEX, vel_read_type, ierror)
     CALL MPI_Type_commit(vel_read_type, ierror)

diff --git a/postpro/am/camstar.f90 b/postpro/am/camstar.f90
@@ -344,7 +344,8 @@ subroutine scale_filter(iy1, iy2)
         do icnt=1,2
             do iV=1,4
                 call IFT(VVdz(:,:,iV,icnt)) ! first transform in z
-                call MPI_Alltoall(VVdz(:,:,iv,icnt), 1, Mdz, VVdx(:,:,iv,icnt), 1, Mdx, MPI_COMM_X)
+                call zTOx(VVdz(:,:,iv,icnt),VVdx(:,:,iv,icnt))
+                !call MPI_Alltoall(VVdz(:,:,iv,icnt), 1, Mdz, VVdx(:,:,iv,icnt), 1, Mdx, MPI_COMM_X)
                 VVdx(nx+2:nxd+1,1:nzB,iV,icnt)=0
                 call RFT(VVdx(:,:,iV,icnt),rVVdx(:,:,iV,icnt)) ! second transform in x
             end do
@@ -596,4 +597,4 @@ subroutine watermark()
 
 
 
-end program
+end program