Skip to content

Commit

Permalink
QPhiX integration:
Browse files Browse the repository at this point in the history
Progress:
   -Dslash works with packed pointers: funny -- single line dslash worked, multi-line not
   -As of this commit, LLVMClover term works
   -Lots of diagnostics added to testClovDslashFull -- some may need backing out.

Issues:
   - calling .elem() inside OpenMP parallel regions is a no-no with QDP-JIT
      + ultimately want raw pointer access for multi-threaded imports
      + however, regular QDP++ importing should still be multithreaded -- needs a tidy
   - on KNL with QDP-JIT, running with -inner 2 or -inner 4 caused incorrect results when applying LLVMCloverTerm;
     results were correct for -inner 8

ToDo:
    - Tidy Clover Tests
    - Add raw pointer access to spinors, clover term and gauge fields for import. New packer.h file? e.g. qdpjit_packers.h
  • Loading branch information
Balint Joo committed Jun 23, 2016
1 parent 583a1c6 commit 3e14b69
Show file tree
Hide file tree
Showing 13 changed files with 1,364 additions and 298 deletions.
199 changes: 147 additions & 52 deletions include/qphix/qdp_packer.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,55 +44,143 @@ namespace QPhiX {
u_minus[mu] = shift(u[mu], BACKWARD, mu);
}


#pragma omp parallel for collapse(4)
for(int t = 0; t < Nt; t++) {
for(int z = 0; z < Nz; z++) {
for(int y = 0; y < Ny; y++) {
for(int s = 0; s < nvecs; s++) {
for(int mu = 0; mu < 4; mu++) {
int outer_c = 3;
if ( compress ) {
outer_c = 2;
}
for(int c = 0; c < outer_c; c++) {
for(int c2 = 0; c2 < 3; c2++) {
for(int x = 0; x < soalen; x++) {
std::cout << "QDP Shift is complete. Entering OpenMP Pack Region" << std::endl;

// No Elem calls in parallel regions
// #pragma omp parallel for collapse(4)
for(int t = 0; t < Nt; t++) {
for(int z = 0; z < Nz; z++) {
for(int y = 0; y < Ny; y++) {
for(int s = 0; s < nvecs; s++) {
for(int mu = 0; mu < 4; mu++) {
int outer_c = 3;
if ( compress ) {
outer_c = 2;
}
for(int c = 0; c < outer_c; c++) {
for(int c2 = 0; c2 < 3; c2++) {
for(int x = 0; x < soalen; x++) {

//#ifndef USE_PACKED_GAUGES
//int xx = x;
//int block = ((t*Nz+z)*Ny+y)*nvecs+s;

//#endif
//#else // USE_PACKED_GAUGES
int block = (t*Pxyz+z*Pxy)/nyg+(y/nyg)*nvecs+s;
int xx = (y%nyg)*soalen+x;
// #endif // USE_PACKED_GAUGES

int qdpsite = x + soalen*(s + nvecs*(y + Ny*(z + Nz*t)));
u_cb0[block][2*mu][c][c2][0][xx] = u_minus[mu].elem(rb[0].start() + qdpsite).elem().elem(c2,c).real();
u_cb0[block][2*mu][c][c2][1][xx] = u_minus[mu].elem(rb[0].start() + qdpsite).elem().elem(c2,c).imag();
u_cb0[block][2*mu+1][c][c2][0][xx] = u[mu].elem(rb[0].start() + qdpsite).elem().elem(c2,c).real();
u_cb0[block][2*mu+1][c][c2][1][xx] = u[mu].elem(rb[0].start() + qdpsite).elem().elem(c2,c).imag();

u_cb1[block][2*mu][c][c2][0][xx] = u_minus[mu].elem(rb[1].start() + qdpsite).elem().elem(c2,c).real();
u_cb1[block][2*mu][c][c2][1][xx] = u_minus[mu].elem(rb[1].start() + qdpsite).elem().elem(c2,c).imag();
u_cb1[block][2*mu+1][c][c2][0][xx] = u[mu].elem(rb[1].start() + qdpsite).elem().elem(c2,c).real();
u_cb1[block][2*mu+1][c][c2][1][xx] = u[mu].elem(rb[1].start() + qdpsite).elem().elem(c2,c).imag();
}
}
}
}
}
}
}
}

QDPIO::cout << "Leaving Gauge PAck" << std::endl;
}

//#ifndef USE_PACKED_GAUGES
//int xx = x;
//int block = ((t*Nz+z)*Ny+y)*nvecs+s;
#ifdef QPHIX_BUILD_CLOVER

//#endif
//#else // USE_PACKED_GAUGES
int block = (t*Pxyz+z*Pxy)/nyg+(y/nyg)*nvecs+s;
int xx = (y%nyg)*soalen+x;
// #endif // USE_PACKED_GAUGES
#ifdef QPHIX_BUILD_QDPJIT
// extern int64_t getDataLayoutInnerSize();

int qdpsite = x + soalen*(s + nvecs*(y + Ny*(z + Nz*t)));
u_cb0[block][2*mu][c][c2][0][xx] = u_minus[mu].elem(rb[0].start() + qdpsite).elem().elem(c2,c).real();
u_cb0[block][2*mu][c][c2][1][xx] = u_minus[mu].elem(rb[0].start() + qdpsite).elem().elem(c2,c).imag();
u_cb0[block][2*mu+1][c][c2][0][xx] = u[mu].elem(rb[0].start() + qdpsite).elem().elem(c2,c).real();
u_cb0[block][2*mu+1][c][c2][1][xx] = u[mu].elem(rb[0].start() + qdpsite).elem().elem(c2,c).imag();


u_cb1[block][2*mu][c][c2][0][xx] = u_minus[mu].elem(rb[1].start() + qdpsite).elem().elem(c2,c).real();
u_cb1[block][2*mu][c][c2][1][xx] = u_minus[mu].elem(rb[1].start() + qdpsite).elem().elem(c2,c).imag();
u_cb1[block][2*mu+1][c][c2][0][xx] = u[mu].elem(rb[1].start() + qdpsite).elem().elem(c2,c).real();
u_cb1[block][2*mu+1][c][c2][1][xx] = u[mu].elem(rb[1].start() + qdpsite).elem().elem(c2,c).imag();
}
}
}
}
}
}
}
}
}
// Pack a clover term into QPhiX CloverBlock layout for one checkerboard.
//
// This accesses the internals of the LLVMCloverTerm: it reads the term's
// diagonal and off-diagonal buffers site by site and scatters them into
// the blocked cl_out array.
//
// Template parameters:
//   FT, veclen, soalen, compress - select the ClovDslash / Geometry
//                                  instantiation whose CloverBlock is filled.
//   ClovTerm - clover term type; must provide DiagType, OffDiagType,
//              getDiagBuffer() and getOffDiagBuffer().
//
// Parameters:
//   qdp_clov_in - source clover term.
//   cl_out      - destination array of CloverBlock, indexed by `block` below.
//   s           - geometry supplying Nt, Nz, Ny, nVecs, nGY, Pxy and Pxyz.
//   cb          - checkerboard index; rb[cb].start() is added to the site index.
template<typename FT, int veclen, int soalen, bool compress, typename ClovTerm>
void qdp_pack_clover(const ClovTerm& qdp_clov_in,
                     typename ClovDslash<FT,veclen,soalen,compress>::CloverBlock* cl_out,Geometry<FT,veclen,soalen,compress>& s, int cb)
{
  // Get the subgrid lattice size and the blocking parameters.
  const int Nt = s.Nt();
  const int Nz = s.Nz();
  const int Ny = s.Ny();
  const int nvecs = s.nVecs();
  const int nyg = s.nGY();
  const int Pxy = s.getPxy();
  const int Pxyz = s.getPxyz();

  // Original author's note on the QDP layout:
  //   Outer x 2 chiral blocks x 6 floats x inner sites
  // comp[0] feeds diag1/off_diag1 and comp[1] feeds diag2/off_diag2.
  const typename ClovTerm::DiagType& diag_buf = qdp_clov_in.getDiagBuffer();
  const typename ClovTerm::OffDiagType& off_diag_buf = qdp_clov_in.getOffDiagBuffer();

  // Deliberately serial: no elem() calls are allowed inside an OpenMP
  // parallel region, so the former
  //   #pragma omp parallel for collapse(4)
  // stays disabled.
  //
  // The vector index is named `vec` rather than `s` so that it does not
  // shadow the Geometry parameter `s`.
  for(int t = 0; t < Nt; t++) {
    for(int z = 0; z < Nz; z++) {
      for(int y = 0; y < Ny; y++) {
        for(int vec = 0; vec < nvecs; vec++) {
          for(int x = 0; x < soalen; x++) {

            // Destination: which CloverBlock, and which lane inside it.
            const int block = (t*Pxyz+z*Pxy)/nyg+(y/nyg)*nvecs+vec;
            const int xx = (y%nyg)*soalen+x;

            // Source: site index within the buffers, offset to checkerboard cb.
            const int qdpsite = x + soalen*(vec + nvecs*(y + Ny*(z + Nz*t)))+rb[cb].start();

            // First component (comp[0]): 6 diagonal entries ...
            for(int d=0; d < 6; d++) {
              cl_out[block].diag1[d][xx] = static_cast<FT>(diag_buf.elem(qdpsite).comp[0].diag[d].elem().elem());
            }
            // ... and 15 off-diagonal entries, split into RE / IM parts.
            for(int od=0; od < 15; od++) {
              cl_out[block].off_diag1[od][RE][xx] = static_cast<FT>(off_diag_buf.elem(qdpsite).comp[0].offd[od].real().elem());
              cl_out[block].off_diag1[od][IM][xx] = static_cast<FT>(off_diag_buf.elem(qdpsite).comp[0].offd[od].imag().elem());
            }

            // Second component (comp[1]), same shape as the first.
            for(int d=0; d < 6; d++) {
              cl_out[block].diag2[d][xx] = static_cast<FT>(diag_buf.elem(qdpsite).comp[1].diag[d].elem().elem());
            }
            for(int od=0; od < 15; od++) {
              cl_out[block].off_diag2[od][RE][xx] = static_cast<FT>(off_diag_buf.elem(qdpsite).comp[1].offd[od].real().elem());
              cl_out[block].off_diag2[od][IM][xx] = static_cast<FT>(off_diag_buf.elem(qdpsite).comp[1].offd[od].imag().elem());
            }
          }
        }
      }
    }
  }
}


#else

#ifdef QPHIX_BUILD_CLOVER
template<typename FT, int veclen, int soalen, bool compress, typename ClovTerm>
void qdp_pack_clover(const ClovTerm& qdp_clov_in,
void qdp_pack_clover(const ClovTerm& clov_in,
typename ClovDslash<FT,veclen,soalen,compress>::CloverBlock* cl_out,Geometry<FT,veclen,soalen,compress>& s, int cb)
{
// Get the subgrid latt size.
Expand All @@ -104,6 +192,9 @@ namespace QPhiX {
int Pxy = s.getPxy();
int Pxyz = s.getPxyz();

auto qdp_clov_in = clov_in.getTriBuffer();


#pragma omp parallel for collapse(4)
for(int t = 0; t < Nt; t++) {
for(int z = 0; z < Nz; z++) {
Expand All @@ -116,19 +207,20 @@ namespace QPhiX {
int qdpsite = x + soalen*(s + nvecs*(y + Ny*(z + Nz*t)))+rb[cb].start();

for(int d=0; d < 6; d++) {
cl_out[block].diag1[d][xx]=qdp_clov_in[qdpsite].diag[0][d].elem();
cl_out[block].diag1[d][xx]=qdp_clov_in[qdpsite].diag[0][d].elem();
}
for(int od=0; od < 15; od++) {
cl_out[block].off_diag1[od][RE][xx]=qdp_clov_in[qdpsite].offd[0][od].real();
cl_out[block].off_diag1[od][IM][xx]=qdp_clov_in[qdpsite].offd[0][od].imag();
cl_out[block].off_diag1[od][RE][xx]=qdp_clov_in[qdpsite].offd[0][od].real();
cl_out[block].off_diag1[od][IM][xx]=qdp_clov_in[qdpsite].offd[0][od].imag();
}

for(int d=0; d < 6; d++) {
cl_out[block].diag2[d][xx]=qdp_clov_in[qdpsite].diag[1][d].elem();
cl_out[block].diag2[d][xx]=qdp_clov_in[qdpsite].diag[1][d].elem();

}
for(int od=0; od < 15; od++) {
cl_out[block].off_diag2[od][RE][xx]=qdp_clov_in[qdpsite].offd[1][od].real();
cl_out[block].off_diag2[od][IM][xx]=qdp_clov_in[qdpsite].offd[1][od].imag();
cl_out[block].off_diag2[od][RE][xx]=qdp_clov_in[qdpsite].offd[1][od].real();
cl_out[block].off_diag2[od][IM][xx]=qdp_clov_in[qdpsite].offd[1][od].imag();
}
}
}
Expand All @@ -137,6 +229,7 @@ namespace QPhiX {
}
}

#endif
#endif // IFDEF BUILD CLOVER


Expand All @@ -155,7 +248,8 @@ namespace QPhiX {
int Pxy = s.getPxy();
int Pxyz = s.getPxyz();

#pragma omp parallel for collapse(4)
// No elem in OpenMP parallel region
//#pragma omp parallel for collapse(4)
for(int t=0; t < Nt; t++) {
for(int z=0; z < Nz; z++) {
for(int y=0; y < Ny; y++) {
Expand Down Expand Up @@ -253,7 +347,8 @@ namespace QPhiX {
int Pxyz = s.getPxyz();


#pragma omp parallel for collapse(4)
// No elem() in OpenMP parallel
//#pragma omp parallel for collapse(4)
for(int t=0; t < Nt; t++) {
for(int z=0; z < Nz; z++) {
for(int y=0; y < Ny; y++) {
Expand Down
Loading

0 comments on commit 3e14b69

Please sign in to comment.