From 73947871e8dcd11dc31af725dfa8cbb88527427e Mon Sep 17 00:00:00 2001 From: Balint Joo Date: Fri, 24 Jun 2016 22:55:54 -0400 Subject: [PATCH] Initial integration with QDP-JIT - packers are fully threaded and use raw QDPJIT pointer access. - packers work only when QDP-JIT is run with -layout ocsri Todo: - recheck integration has not broken non QDP-JIT build - bulletproof (with assertions) to ensure layout is ocsri, or adapta also to oscri - Use as 'native' dslash: only possible in ocsri mode since that is what is wired into QDPJIT - lots of testing. - test full chroma QDP-JIT build (e.g. Leapfrog) --- NEWS | 1 + TODO | 9 +- include/qphix/Makefile.am | 2 +- include/qphix/qdp_packer.h | 398 +-------------------------- include/qphix/qdp_packer_parscalar.h | 355 +----------------------- include/qphix/qdp_packer_qdpjit.h | 339 +++++++++++++++++++++++ tests/testClovDslashFull.cc | 15 +- 7 files changed, 362 insertions(+), 757 deletions(-) create mode 100644 include/qphix/qdp_packer_qdpjit.h diff --git a/NEWS b/NEWS index 91dc768f..9a121876 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,4 @@ +6/24/2015 (Balint Joo/Thorsten Kurth) - Added lots of cleanup for KNL. Added packers for QDP-JIT (OCSRI layout for now) 1/12/2016 (Balint Joo/Aaron Walden) - Added AVX512 output from code generator. Tested on Intel SDE. Single and Double precision work, 16-bit precision is currently broken. 4/9/2015 (Balint Joo): Added Basic Multi-Shift solvers -- but can still add more optimizations. Need to test multi-node etc. diff --git a/TODO b/TODO index 9e560720..0cad6154 100644 --- a/TODO +++ b/TODO @@ -1,10 +1,13 @@ +To do with QDPJIT: + - Thorough testing for SOA combinations (initially done with 4) + - Bullet proofing assertions that data is in OCSRI layout or make the code generically figure out the right layout. + - Write native dslash using raw pointers: need OSCRI layout assertion and InnerLength assertion (easy) + - but issue w.r.t currently onle inner=8 from QDPJIT apparently working in my tests + TO Do: - Add cache blocking in X -- hooks are already present in the kernels, but need to adapt loops. -- Add other solvers specially multi-shift - Solve N-systems at once (ongoing PhD project at Old Dominion University Computer Science Department: ODU-JLab collaboration) - Clean code for non ICC compilers (work ongoing by Diptorup Deb, Renaissance Computing Institute, University of North Carolina, Chapel Hill -- Add other processor targets: SSE, AVX2, AVX512 - codegen supports these already but need to be added -- Add other fermions: e.g. Twisted Mass, HiSQ, Domain Wall? - Bullet proof a little (A LOT!!!) - Better tuning (implement 1/2/3/4 threads per core instead of just all 4) - reduce verbosite and use master printf to better effect (log levels?) diff --git a/include/qphix/Makefile.am b/include/qphix/Makefile.am index 296b7ad4..c64432f9 100644 --- a/include/qphix/Makefile.am +++ b/include/qphix/Makefile.am @@ -39,7 +39,7 @@ qphix_include_HEADERS += \ # if QPHIX_BUILD_WITH_QDP -qphix_include_HEADERS += ./qdp_packer.h +qphix_include_HEADERS += ./qdp_packer.h ./qdp_packer_parscalar.h qdp_packer_qdpjit.h endif diff --git a/include/qphix/qdp_packer.h b/include/qphix/qdp_packer.h index 4dfc9ee7..47615404 100644 --- a/include/qphix/qdp_packer.h +++ b/include/qphix/qdp_packer.h @@ -9,12 +9,10 @@ #include "qphix/dslash_def.h" #include "qphix/qphix_config.h" -#if defined(QPHIX_MIC_SOURCE) || defined(QPHIX_AVX512_SOURCE) -#include -#endif - -#ifdef QPHIX_BUILD_CLOVER -#include "qphix/clover_dslash_def.h" +#ifdef QPHIX_BUILD_QDPJIT +#include "qphix/qdp_packer_qdpjit.h" +#else +#include "qphix/qdp_packer_parscalar.h" #endif using namespace QDP; @@ -22,259 +20,6 @@ using namespace QDP; namespace QPhiX { - template - void qdp_pack_gauge(const QDPGauge& u, - typename Geometry::SU3MatrixBlock *u_cb0, - typename Geometry::SU3MatrixBlock *u_cb1, - Geometry& s) - { - // Get the subgrid latt size. - int Nt = s.Nt(); - int Nz = s.Nz(); - int Ny = s.Ny(); - int nvecs = s.nVecs(); - int nyg = s.nGY(); - int Pxy = s.getPxy(); - int Pxyz = s.getPxyz(); - - - // Shift the lattice to get U(x-mu) - QDPGauge u_minus(4); - for(int mu=0; mu < 4; mu++) { - u_minus[mu] = shift(u[mu], BACKWARD, mu); - } - - std::cout << "QDP Shift is complete. Entering OpenMP Pack Region" << std::endl; - -// No Elem calls in parallel regions -// #pragma omp parallel for collapse(4) - for(int t = 0; t < Nt; t++) { - for(int z = 0; z < Nz; z++) { - for(int y = 0; y < Ny; y++) { - for(int s = 0; s < nvecs; s++) { - for(int mu = 0; mu < 4; mu++) { - int outer_c = 3; - if ( compress ) { - outer_c = 2; - } - for(int c = 0; c < outer_c; c++) { - for(int c2 = 0; c2 < 3; c2++) { - for(int x = 0; x < soalen; x++) { - - //#ifndef USE_PACKED_GAUGES - //int xx = x; - //int block = ((t*Nz+z)*Ny+y)*nvecs+s; - - //#endif - //#else // USE_PACKED_GAUGES - int block = (t*Pxyz+z*Pxy)/nyg+(y/nyg)*nvecs+s; - int xx = (y%nyg)*soalen+x; - // #endif // USE_PACKED_GAUGES - - int qdpsite = x + soalen*(s + nvecs*(y + Ny*(z + Nz*t))); - u_cb0[block][2*mu][c][c2][0][xx] = u_minus[mu].elem(rb[0].start() + qdpsite).elem().elem(c2,c).real(); - u_cb0[block][2*mu][c][c2][1][xx] = u_minus[mu].elem(rb[0].start() + qdpsite).elem().elem(c2,c).imag(); - u_cb0[block][2*mu+1][c][c2][0][xx] = u[mu].elem(rb[0].start() + qdpsite).elem().elem(c2,c).real(); - u_cb0[block][2*mu+1][c][c2][1][xx] = u[mu].elem(rb[0].start() + qdpsite).elem().elem(c2,c).imag(); - - u_cb1[block][2*mu][c][c2][0][xx] = u_minus[mu].elem(rb[1].start() + qdpsite).elem().elem(c2,c).real(); - u_cb1[block][2*mu][c][c2][1][xx] = u_minus[mu].elem(rb[1].start() + qdpsite).elem().elem(c2,c).imag(); - u_cb1[block][2*mu+1][c][c2][0][xx] = u[mu].elem(rb[1].start() + qdpsite).elem().elem(c2,c).real(); - u_cb1[block][2*mu+1][c][c2][1][xx] = u[mu].elem(rb[1].start() + qdpsite).elem().elem(c2,c).imag(); - } - } - } - } - } - } - } - } - - QDPIO::cout << "Leaving Gauge PAck" << std::endl; - } - -#ifdef QPHIX_BUILD_CLOVER - -#ifdef QPHIX_BUILD_QDPJIT - // extern int64_t getDataLayoutInnerSize(); - - // This accesses the Internals of the LLVMCloverTerm - template - void qdp_pack_clover(const ClovTerm& qdp_clov_in, - typename ClovDslash::CloverBlock* cl_out,Geometry& s, int cb) - { - // Get the subgrid latt size. - int Nt = s.Nt(); - int Nz = s.Nz(); - int Ny = s.Ny(); - int nvecs = s.nVecs(); - int nyg = s.nGY(); - int Pxy = s.getPxy(); - int Pxyz = s.getPxyz(); - - // Sanity Check - // QDP Type is - // Outer x 2 chiral blocks x 6 floats x inner sites - const typename ClovTerm::DiagType& diag_buf = qdp_clov_in.getDiagBuffer(); - const typename ClovTerm::OffDiagType& off_diag_buf = qdp_clov_in.getOffDiagBuffer(); - - // const int qdp_inner_size =(int)getDataLayoutInnerSize(); - // const int num_comp = 2; - // const int num_complex = 2; - - - - - // No elem calls in parallel region - //#pragma omp parallel for collapse(4) - for(int t = 0; t < Nt; t++) { - for(int z = 0; z < Nz; z++) { - for(int y = 0; y < Ny; y++) { - for(int s = 0; s < nvecs; s++) { - for(int x = 0; x < soalen; x++) { - - int block = (t*Pxyz+z*Pxy)/nyg+(y/nyg)*nvecs+s; - int xx = (y%nyg)*soalen+x; - - int qdpsite = x + soalen*(s + nvecs*(y + Ny*(z + Nz*t)))+rb[cb].start(); - // int qdp_osite = qdpsite/qdp_inner_size; - // int qdp_isite = qdpsite%qdp_inner_size; - - - for(int d=0; d < 6; d++) { - cl_out[block].diag1[d][xx] = (FT)diag_buf.elem(qdpsite).comp[0].diag[d].elem().elem(); - // cl_out[block].diag1[d][xx] = (FT)1/(FT)4.1; - } - for(int od=0; od < 15; od++) { - cl_out[block].off_diag1[od][RE][xx] = off_diag_buf.elem(qdpsite).comp[0].offd[od].real().elem(); - cl_out[block].off_diag1[od][IM][xx] = off_diag_buf.elem(qdpsite).comp[0].offd[od].imag().elem(); - - // cl_out[block].off_diag1[od][RE][xx] = 0; - // cl_out[block].off_diag1[od][IM][xx] = 0; - - - } - - for(int d=0; d < 6; d++) { - cl_out[block].diag2[d][xx] = diag_buf.elem(qdpsite).comp[1].diag[d].elem().elem(); - // cl_out[block].diag2[d][xx] = (FT)1/(FT)4.1; - } - for(int od=0; od < 15; od++) { - - cl_out[block].off_diag2[od][RE][xx] = off_diag_buf.elem(qdpsite).comp[1].offd[od].real().elem(); - cl_out[block].off_diag2[od][IM][xx] = off_diag_buf.elem(qdpsite).comp[1].offd[od].imag().elem(); - - // cl_out[block].off_diag2[od][RE][xx] = 0; - // cl_out[block].off_diag2[od][IM][xx] = 0; - - - } - } - } - } - } - } - } - - -#else - - template - void qdp_pack_clover(const ClovTerm& clov_in, - typename ClovDslash::CloverBlock* cl_out,Geometry& s, int cb) - { - // Get the subgrid latt size. - int Nt = s.Nt(); - int Nz = s.Nz(); - int Ny = s.Ny(); - int nvecs = s.nVecs(); - int nyg = s.nGY(); - int Pxy = s.getPxy(); - int Pxyz = s.getPxyz(); - - auto qdp_clov_in = clov_in.getTriBuffer(); - - -#pragma omp parallel for collapse(4) - for(int t = 0; t < Nt; t++) { - for(int z = 0; z < Nz; z++) { - for(int y = 0; y < Ny; y++) { - for(int s = 0; s < nvecs; s++) { - for(int x = 0; x < soalen; x++) { - - int block = (t*Pxyz+z*Pxy)/nyg+(y/nyg)*nvecs+s; - int xx = (y%nyg)*soalen+x; - int qdpsite = x + soalen*(s + nvecs*(y + Ny*(z + Nz*t)))+rb[cb].start(); - - for(int d=0; d < 6; d++) { - cl_out[block].diag1[d][xx]=qdp_clov_in[qdpsite].diag[0][d].elem(); - } - for(int od=0; od < 15; od++) { - cl_out[block].off_diag1[od][RE][xx]=qdp_clov_in[qdpsite].offd[0][od].real(); - cl_out[block].off_diag1[od][IM][xx]=qdp_clov_in[qdpsite].offd[0][od].imag(); - } - - for(int d=0; d < 6; d++) { - cl_out[block].diag2[d][xx]=qdp_clov_in[qdpsite].diag[1][d].elem(); - - } - for(int od=0; od < 15; od++) { - cl_out[block].off_diag2[od][RE][xx]=qdp_clov_in[qdpsite].offd[1][od].real(); - cl_out[block].off_diag2[od][IM][xx]=qdp_clov_in[qdpsite].offd[1][od].imag(); - } - } - } - } - } - } - } - -#endif -#endif // IFDEF BUILD CLOVER - - - template - void qdp_pack_cb_spinor(const QDPSpinor& psi_in, - typename Geometry::FourSpinorBlock* psi, - Geometry& s, - int cb) - { - // Get the subgrid latt size. - int Nt = s.Nt(); - int Nz = s.Nz(); - int Ny = s.Ny(); - int Nxh = s.Nxh(); - int nvecs = s.nVecs(); - int Pxy = s.getPxy(); - int Pxyz = s.getPxyz(); - - // No elem in OpenMP parallel region -//#pragma omp parallel for collapse(4) - for(int t=0; t < Nt; t++) { - for(int z=0; z < Nz; z++) { - for(int y=0; y < Ny; y++) { - for(int s=0; s < nvecs; s++) { - for(int col=0; col < 3; col++) { - for(int spin=0; spin < 4; spin++) { - for(int x=0; x < soalen; x++) { - - int ind = t*Pxyz+z*Pxy+y*nvecs+s; //((t*Nz+z)*Ny+y)*nvecs+s; - int x_coord = s*soalen + x; - - int qdp_ind = ((t*Nz + z)*Ny + y)*Nxh + x_coord; - psi[ind][col][spin][0][x] = psi_in.elem(rb[cb].start()+qdp_ind).elem(spin).elem(col).real(); - psi[ind][col][spin][1][x] = psi_in.elem(rb[cb].start()+qdp_ind).elem(spin).elem(col).imag(); - - } - } - } - } - } - } - } - - } - template void qdp_pack_spinor(const QDPSpinor& psi_in, typename Geometry::FourSpinorBlock* psi_even, @@ -285,94 +30,6 @@ namespace QPhiX { qdp_pack_cb_spinor(psi_in,psi_odd,s,1); } -#if 0 - template - void qdp_pack_spinor(const QDPSpinor& psi_in, - typename Geometry::FourSpinorBlock* psi_even, - typename Geometry::FourSpinorBlock* psi_odd, - Geometry& s) - { - // Get the subgrid latt size. - int Nt = s.Nt(); - int Nz = s.Nz(); - int Ny = s.Ny(); - int Nxh = s.Nxh(); - int nvecs = s.nVecs(); - int Pxy = s.getPxy(); - int Pxyz = s.getPxyz(); - -#pragma omp parallel for collapse(4) - for(int t=0; t < Nt; t++) { - for(int z=0; z < Nz; z++) { - for(int y=0; y < Ny; y++) { - for(int s=0; s < nvecs; s++) { - for(int col=0; col < 3; col++) { - for(int spin=0; spin < 4; spin++) { - for(int x=0; x < soalen; x++) { - - int ind = t*Pxyz+z*Pxy+y*nvecs+s; //((t*Nz+z)*Ny+y)*nvecs+s; - int x_coord = s*soalen + x; - - int qdp_ind = ((t*Nz + z)*Ny + y)*Nxh + x_coord; - psi_even[ind][col][spin][0][x] = psi_in.elem(rb[0].start()+qdp_ind).elem(spin).elem(col).real(); - psi_even[ind][col][spin][1][x] = psi_in.elem(rb[0].start()+qdp_ind).elem(spin).elem(col).imag(); - - psi_odd[ind][col][spin][0][x] = psi_in.elem(rb[1].start()+qdp_ind).elem(spin).elem(col).real(); - psi_odd[ind][col][spin][1][x] = psi_in.elem(rb[1].start()+qdp_ind).elem(spin).elem(col).imag(); - - } - } - } - } - } - } - } - - } -#endif - - - template - void qdp_unpack_cb_spinor(typename Geometry::FourSpinorBlock* chi_packed, - QDPSpinor& chi, - Geometry& s, - int cb) - { - int Nt = s.Nt(); - int Nz = s.Nz(); - int Ny = s.Ny(); - int Nxh = s.Nxh(); - int nvecs = s.nVecs(); - int Pxy = s.getPxy(); - int Pxyz = s.getPxyz(); - - - // No elem() in OpenMP parallel -//#pragma omp parallel for collapse(4) - for(int t=0; t < Nt; t++) { - for(int z=0; z < Nz; z++) { - for(int y=0; y < Ny; y++) { - for(int s=0; s < nvecs; s++) { - for(int spin=0; spin < 4; spin++) { - for(int col=0; col < 3; col++) { - for(int x=0; x < soalen; x++) { - - int ind = t*Pxyz+z*Pxy+y*nvecs+s; //((t*Nz+z)*Ny+y)*nvecs+s; - int x_coord = s*soalen + x; - - int qdp_ind = ((t*Nz + z)*Ny + y)*Nxh + x_coord; - - chi.elem(rb[cb].start()+qdp_ind).elem(spin).elem(col).real() = chi_packed[ind][col][spin][0][x]; - chi.elem(rb[cb].start()+qdp_ind).elem(spin).elem(col).imag() = chi_packed[ind][col][spin][1][x]; - - } - } - } - } - } - } - } - } template void qdp_unpack_spinor(typename Geometry::FourSpinorBlock* chi_even, @@ -384,53 +41,6 @@ namespace QPhiX { qdp_unpack_cb_spinor(chi_odd,chi,s,1); } -#if 0 - template - void qdp_unpack_spinor(typename Geometry::FourSpinorBlock* chi_even, - typename Geometry::FourSpinorBlock* chi_odd, - QDPSpinor& chi, - Geometry& s) - { - - int Nt = s.Nt(); - int Nz = s.Nz(); - int Ny = s.Ny(); - int Nxh = s.Nxh(); - int nvecs = s.nVecs(); - int Pxy = s.getPxy(); - int Pxyz = s.getPxyz(); - - -#pragma omp parallel for collapse(4) - for(int t=0; t < Nt; t++) { - for(int z=0; z < Nz; z++) { - for(int y=0; y < Ny; y++) { - for(int s=0; s < nvecs; s++) { - for(int spin=0; spin < 4; spin++) { - for(int col=0; col < 3; col++) { - for(int x=0; x < soalen; x++) { - - int ind = t*Pxyz+z*Pxy+y*nvecs+s; //((t*Nz+z)*Ny+y)*nvecs+s; - int x_coord = s*soalen + x; - - int qdp_ind = ((t*Nz + z)*Ny + y)*Nxh + x_coord; - - chi.elem(rb[0].start()+qdp_ind).elem(spin).elem(col).real() = chi_even[ind][col][spin][0][x]; - chi.elem(rb[0].start()+qdp_ind).elem(spin).elem(col).imag() = chi_even[ind][col][spin][1][x]; - - chi.elem(rb[1].start()+qdp_ind).elem(spin).elem(col).real() = chi_odd[ind][col][spin][0][x]; - chi.elem(rb[1].start()+qdp_ind).elem(spin).elem(col).imag() = chi_odd[ind][col][spin][1][x]; - - } - } - } - } - } - } - } - } -#endif - #if defined(QPHIX_MIC_SOURCE) || defined(QPHIX_AVX512_SOURCE) // Downconvert an array of float-vecs to an array of float 16 vecs diff --git a/include/qphix/qdp_packer_parscalar.h b/include/qphix/qdp_packer_parscalar.h index caab9e1f..c53b5794 100644 --- a/include/qphix/qdp_packer_parscalar.h +++ b/include/qphix/qdp_packer_parscalar.h @@ -1,7 +1,7 @@ -#ifndef QPHIX_QDP_PACKER_H -#define QPHIX_QDP_PACKER_H - +#ifndef QPHIX_QDP_PACKER_PARSCALAR_H +#define QPHIX_QDP_PACKER_PARSCALAR_H +#warning "building with parscalar packers" #include "qdp.h" #include "qphix/geometry.h" @@ -181,62 +181,6 @@ namespace QPhiX { } - template - void qdp_pack_spinor(const QDPSpinor& psi_in, - typename Geometry::FourSpinorBlock* psi_even, - typename Geometry::FourSpinorBlock* psi_odd, - Geometry& s) - { - qdp_pack_cb_spinor(psi_in,psi_even,s,0); - qdp_pack_cb_spinor(psi_in,psi_odd,s,1); - } - -#if 0 - template - void qdp_pack_spinor(const QDPSpinor& psi_in, - typename Geometry::FourSpinorBlock* psi_even, - typename Geometry::FourSpinorBlock* psi_odd, - Geometry& s) - { - // Get the subgrid latt size. - int Nt = s.Nt(); - int Nz = s.Nz(); - int Ny = s.Ny(); - int Nxh = s.Nxh(); - int nvecs = s.nVecs(); - int Pxy = s.getPxy(); - int Pxyz = s.getPxyz(); - -#pragma omp parallel for collapse(4) - for(int t=0; t < Nt; t++) { - for(int z=0; z < Nz; z++) { - for(int y=0; y < Ny; y++) { - for(int s=0; s < nvecs; s++) { - for(int col=0; col < 3; col++) { - for(int spin=0; spin < 4; spin++) { - for(int x=0; x < soalen; x++) { - - int ind = t*Pxyz+z*Pxy+y*nvecs+s; //((t*Nz+z)*Ny+y)*nvecs+s; - int x_coord = s*soalen + x; - - int qdp_ind = ((t*Nz + z)*Ny + y)*Nxh + x_coord; - psi_even[ind][col][spin][0][x] = psi_in.elem(rb[0].start()+qdp_ind).elem(spin).elem(col).real(); - psi_even[ind][col][spin][1][x] = psi_in.elem(rb[0].start()+qdp_ind).elem(spin).elem(col).imag(); - - psi_odd[ind][col][spin][0][x] = psi_in.elem(rb[1].start()+qdp_ind).elem(spin).elem(col).real(); - psi_odd[ind][col][spin][1][x] = psi_in.elem(rb[1].start()+qdp_ind).elem(spin).elem(col).imag(); - - } - } - } - } - } - } - } - - } -#endif - template void qdp_unpack_cb_spinor(typename Geometry::FourSpinorBlock* chi_packed, @@ -279,299 +223,6 @@ namespace QPhiX { } } - template - void qdp_unpack_spinor(typename Geometry::FourSpinorBlock* chi_even, - typename Geometry::FourSpinorBlock* chi_odd, - QDPSpinor& chi, - Geometry& s) - { - qdp_unpack_cb_spinor(chi_even,chi,s,0); - qdp_unpack_cb_spinor(chi_odd,chi,s,1); - } - -#if 0 - template - void qdp_unpack_spinor(typename Geometry::FourSpinorBlock* chi_even, - typename Geometry::FourSpinorBlock* chi_odd, - QDPSpinor& chi, - Geometry& s) - { - - int Nt = s.Nt(); - int Nz = s.Nz(); - int Ny = s.Ny(); - int Nxh = s.Nxh(); - int nvecs = s.nVecs(); - int Pxy = s.getPxy(); - int Pxyz = s.getPxyz(); - - -#pragma omp parallel for collapse(4) - for(int t=0; t < Nt; t++) { - for(int z=0; z < Nz; z++) { - for(int y=0; y < Ny; y++) { - for(int s=0; s < nvecs; s++) { - for(int spin=0; spin < 4; spin++) { - for(int col=0; col < 3; col++) { - for(int x=0; x < soalen; x++) { - - int ind = t*Pxyz+z*Pxy+y*nvecs+s; //((t*Nz+z)*Ny+y)*nvecs+s; - int x_coord = s*soalen + x; - - int qdp_ind = ((t*Nz + z)*Ny + y)*Nxh + x_coord; - - chi.elem(rb[0].start()+qdp_ind).elem(spin).elem(col).real() = chi_even[ind][col][spin][0][x]; - chi.elem(rb[0].start()+qdp_ind).elem(spin).elem(col).imag() = chi_even[ind][col][spin][1][x]; - - chi.elem(rb[1].start()+qdp_ind).elem(spin).elem(col).real() = chi_odd[ind][col][spin][0][x]; - chi.elem(rb[1].start()+qdp_ind).elem(spin).elem(col).imag() = chi_odd[ind][col][spin][1][x]; - - } - } - } - } - } - } - } - } -#endif - -#if defined(QPHIX_MIC_SOURCE) || defined(QPHIX_AVX512_SOURCE) - - // Downconvert an array of float-vecs to an array of float 16 vecs - void downconvert_array(const float *from, half *to, const unsigned int nvecs) - { -#pragma omp parallel for shared(from,to) - for(int i=0; i < nvecs; i++) { - __m512 in = _mm512_load_ps((void *)(from+16*i)); - #if defined (QPHIX_MIC_SOURCE) - _mm512_extstore_ps((void *)(to+16*i),in,_MM_DOWNCONV_PS_FLOAT16,_MM_HINT_NT); - #elif defined(QPHIX_AVX512_SOURCE) - _mm256_store_si256((__m256i *)(to+16*i), _mm512_cvtps_ph(in, _MM_FROUND_TO_NEAREST_INT) ); - #endif - } - } - - // Upconvert an array of float16 vecs to an array of float 32 - void upconvert_array(const half* from, float *to, const unsigned int nvecs) - { -#pragma omp parallel for shared(from, to) - for(int i=0; i < nvecs; i++) { - #if defined (QPHIX_MIC_SOURCE) - __m512 in = _mm512_extload_ps((void *)(from+16*i),_MM_UPCONV_PS_FLOAT16, _MM_BROADCAST32_NONE, _MM_HINT_T0); - _mm512_storenrngo_ps((void *)(to+16*i),in); - #elif defined(QPHIX_AVX512_SOURCE) - __m512 in = _mm512_cvtph_ps(_mm256_load_si256((__m256i*)(from+16*i) ) ); - _mm512_stream_ps((void *)(to+16*i),in); - #endif - } - } - - - // Half precision packers... - template - void qdp_pack_gauge(const QDPGauge& u, - typename Geometry::SU3MatrixBlock *u_cb0, - typename Geometry::SU3MatrixBlock *u_cb1, - Geometry& s) - { - - typedef typename Geometry::SU3MatrixBlock GaugeF; - - int latt_size[4]; - int By,Bz,NCores,Sy,Sz,PadXY,PatXYZ,MinCt; - latt_size[0]=s.Nx(); - latt_size[1]=s.Ny(); - latt_size[2]=s.Nz(); - latt_size[3]=s.Nt(); - // Make a geometry for allocating - Geometry g_float(latt_size, - s.getBy(), - s.getBz(), - s.getNumCores(), - s.getSy(), - s.getSz(), - s.getPadXY(), - s.getPadXYZ(), - s.getMinCt()); - - GaugeF* tmp_cb0 = (GaugeF *)g_float.allocCBGauge(); - GaugeF* tmp_cb1 = (GaugeF *)g_float.allocCBGauge(); - // CHECK THESE ALLOCATIONS - - // OK Now pack the float - qdp_pack_gauge(u, tmp_cb0, tmp_cb1, g_float); - - // This is copied out of the allocation routine. - // Basically we take all that we have allocated - // and divide by veclen*sizeof(float) to get the number - // of vectors to downconvert - unsigned int n_f_vecs = (((s.getPxyz()*s.Nt()*soalen)/16)*sizeof(GaugeF))/(16*sizeof(float)); - - downconvert_array((float *)tmp_cb0, (half *)u_cb0, n_f_vecs); - downconvert_array((float *)tmp_cb1, (half *)u_cb1, n_f_vecs); - - g_float.free(tmp_cb0); - g_float.free(tmp_cb1); - - } - - template - void qdp_pack_spinor(const QDPSpinor& psi_in, - typename Geometry::FourSpinorBlock* psi_even, - typename Geometry::FourSpinorBlock* psi_odd, - Geometry& s) - { - typedef typename Geometry::FourSpinorBlock SpinorF; - - int latt_size[4]; - int By,Bz,NCores,Sy,Sz,PadXY,PatXYZ,MinCt; - latt_size[0]=s.Nx(); - latt_size[1]=s.Ny(); - latt_size[2]=s.Nz(); - latt_size[3]=s.Nt(); - // Make a geometry for allocating - Geometry g_float(latt_size, - s.getBy(), - s.getBz(), - s.getNumCores(), - s.getSy(), - s.getSz(), - s.getPadXY(), - s.getPadXYZ(), - s.getMinCt()); - - - SpinorF* tmp_cb0_alloc = (SpinorF *)g_float.allocCBFourSpinor(); - SpinorF* tmp_cb1_alloc = (SpinorF *)g_float.allocCBFourSpinor(); - - SpinorF* tmp_cb0 = tmp_cb0_alloc; - SpinorF* tmp_cb1 = tmp_cb1_alloc; - - // CHECK THESE ALLOCATIONS - - // OK Now pack the float - qdp_pack_spinor(psi_in, tmp_cb0, tmp_cb1, g_float); - - // This is copied out of the allocation routine. - // Basically we take all that we have allocated - // and divide by veclen*sizeof(float) to get the number - // of vectors to downconvert - - unsigned int n_f_vecs = ((s.getPxyz()*s.Nt())*sizeof(SpinorF))/(16*sizeof(float)); - - downconvert_array((float *)tmp_cb0, (half *)psi_even, n_f_vecs); - downconvert_array((float *)tmp_cb1, (half *)psi_odd, n_f_vecs); - - g_float.free(tmp_cb0_alloc); - g_float.free(tmp_cb1_alloc); - - } - - template - void qdp_unpack_spinor(typename Geometry::FourSpinorBlock* chi_even, - typename Geometry::FourSpinorBlock* chi_odd, - QDPSpinor& chi, - Geometry& s) - { - - typedef typename Geometry::FourSpinorBlock SpinorF; - - int latt_size[4]; - int By,Bz,NCores,Sy,Sz,PadXY,PatXYZ,MinCt; - latt_size[0]=s.Nx(); - latt_size[1]=s.Ny(); - latt_size[2]=s.Nz(); - latt_size[3]=s.Nt(); - // Make a geometry for allocating - Geometry g_float(latt_size, - s.getBy(), - s.getBz(), - s.getNumCores(), - s.getSy(), - s.getSz(), - s.getPadXY(), - s.getPadXYZ(), - s.getMinCt()); - - - // FIXME: CHECK THESE ALLOCATIONS - SpinorF* tmp_cb0_alloc = (SpinorF *)g_float.allocCBFourSpinor(); - SpinorF* tmp_cb1_alloc = (SpinorF *)g_float.allocCBFourSpinor(); - - SpinorF* tmp_cb0 = tmp_cb0_alloc; - SpinorF* tmp_cb1 = tmp_cb1_alloc; - - // This is copied out of the allocation routine. - // Basically we take all that we have allocated - // and divide by veclen*sizeof(float) to get the number - // of vectors to downconvert - - unsigned int n_f_vecs = ((s.getPxyz()*s.Nt())*sizeof(SpinorF))/(16*sizeof(float)); - - upconvert_array((half *)chi_even, (float *)tmp_cb0, n_f_vecs); - upconvert_array((half *)chi_odd, (float *)tmp_cb1, n_f_vecs); - - // OK Now pack the float - qdp_unpack_spinor(tmp_cb0, tmp_cb1,chi, g_float); - -//std::cout << "spinor up free in" << std::endl; - g_float.free(tmp_cb0_alloc); - g_float.free(tmp_cb1_alloc); -//std::cout << "spinor up free out" << std::endl; - - } - -#ifdef QPHIX_BUILD_CLOVER - template - void qdp_pack_clover(const ClovTerm& qdp_clov_in, - typename Geometry::CloverBlock* cl_out, - Geometry& s, int cb) - { - typedef typename Geometry::CloverBlock ClovF; - - int latt_size[4]; - int By,Bz,NCores,Sy,Sz,PadXY,PatXYZ,MinCt; - latt_size[0]=s.Nx(); - latt_size[1]=s.Ny(); - latt_size[2]=s.Nz(); - latt_size[3]=s.Nt(); - // Make a geometry for allocating - Geometry g_float(latt_size, - s.getBy(), - s.getBz(), - s.getNumCores(), - s.getSy(), - s.getSz(), - s.getPadXY(), - s.getPadXYZ(), - s.getMinCt()); - - - - - ClovF* tmp_clov = (ClovF *)g_float.allocCBClov(); - - // This is copied out of the allocation routine. - // Basically we take all that we have allocated - // and divide by veclen*sizeof(float) to get the number - // of vectors to downconvert - - unsigned int n_f_vecs = (( (s.getPxyz()*s.Nt()*soalen) / 16 )*sizeof(ClovF))/(16*sizeof(float)); - - - // OK Now pack the float - qdp_pack_clover(qdp_clov_in, tmp_clov, g_float, cb); - - downconvert_array((float *)tmp_clov,(half *)cl_out,n_f_vecs); -//std::cout << "clov down free in" << std::endl; - g_float.free(tmp_clov); -//std::cout << "clov down free out" << std::endl; - } -#endif // Build Clover - -#endif // if defined(QPHIX_MIC_SOURCE) || defined(QPHIX_AVX512_SOURCE) - }; diff --git a/include/qphix/qdp_packer_qdpjit.h b/include/qphix/qdp_packer_qdpjit.h new file mode 100644 index 00000000..3a812b7c --- /dev/null +++ b/include/qphix/qdp_packer_qdpjit.h @@ -0,0 +1,339 @@ +#ifndef QPHIX_QDP_PACKER_QDPJIT_H +#define QPHIX_QDP_PACKER_QDPJIT_H + +#warning "building with QDPJIT packers" + + +#include "qdp.h" +#include "qphix/geometry.h" + +#include "qphix/dslash_def.h" +#include "qphix/qphix_config.h" + +#if defined(QPHIX_MIC_SOURCE) || defined(QPHIX_AVX512_SOURCE) +#include +#endif + +#ifdef QPHIX_BUILD_CLOVER +#include "qphix/clover_dslash_def.h" +#endif + +using namespace QDP; + +namespace QPhiX { + + + template + void qdp_pack_gauge(const QDPGauge& u, + typename Geometry::SU3MatrixBlock *u_cb0, + typename Geometry::SU3MatrixBlock *u_cb1, + Geometry& s) + { + // Get the subgrid latt size. + int Nt = s.Nt(); + int Nz = s.Nz(); + int Ny = s.Ny(); + int nvecs = s.nVecs(); + int nyg = s.nGY(); + int Pxy = s.getPxy(); + int Pxyz = s.getPxyz(); + int qdp_inner_dim = (int)getDataLayoutInnerSize(); + const int n_complex_dim = 2; + + // Shift the lattice to get U(x-mu) + QDPGauge u_minus(4); + for(int mu=0; mu < 4; mu++) { + u_minus[mu] = shift(u[mu], BACKWARD, mu); + } + + // This ought to be the underlying precision type (float/double) in use + // by QDP-JIT and may well be different from FT + // So casting will be needed + using WT = typename WordType::Type_t; + +#pragma omp parallel for collapse(4) + for(int t = 0; t < Nt; t++) { + for(int z = 0; z < Nz; z++) { + for(int y = 0; y < Ny; y++) { + for(int s = 0; s < nvecs; s++) { + for(int mu = 0; mu < 4; mu++) { + + const WT* u_ptr = (const WT*)((u[mu]).getFjit()); + const WT* u_minus_ptr = (const WT*)((u_minus[mu]).getFjit()); + + int outer_c = 3; + if ( compress ) { + outer_c = 2; + } + for(int c = 0; c < outer_c; c++) { + for(int c2 = 0; c2 < 3; c2++) { + for(int x = 0; x < soalen; x++) { + + + int block = (t*Pxyz+z*Pxy)/nyg+(y/nyg)*nvecs+s; + int xx = (y%nyg)*soalen+x; + + // QDP-JIT in OCSRI layout: + // Outer x Color x Spin x Real x Inner + // + // However, it hardly matters maybe because + // There is no spin on the gauge fields + + int qdpsite = x + soalen*(s + nvecs*(y + Ny*(z + Nz*t))); + + int qdpsite_rb0 = qdpsite+rb[0].start(); + int qdp_outer_rb0 = qdpsite_rb0 / qdp_inner_dim; + int qdp_inner_rb0 = qdpsite_rb0 % qdp_inner_dim; + int rb0_offset = qdp_inner_rb0 + qdp_inner_dim*n_complex_dim*Nc*Nc*qdp_outer_rb0; + + u_cb0[block][2*mu][c][c2][RE][xx] = (FT)u_minus_ptr[rb0_offset + qdp_inner_dim*(RE+n_complex_dim*(c + Nc*c2)) ]; + u_cb0[block][2*mu][c][c2][IM][xx] = (FT)u_minus_ptr[rb0_offset + qdp_inner_dim*(IM+n_complex_dim*(c + Nc*c2)) ]; + u_cb0[block][2*mu+1][c][c2][RE][xx] = (FT)u_ptr[rb0_offset + qdp_inner_dim*(RE+n_complex_dim*(c + Nc*c2)) ]; + u_cb0[block][2*mu+1][c][c2][IM][xx] = (FT)u_ptr[rb0_offset + qdp_inner_dim*(IM+n_complex_dim*(c + Nc*c2)) ]; + + int qdpsite_rb1 = qdpsite+rb[1].start(); + int qdp_outer_rb1 = qdpsite_rb1 / qdp_inner_dim; + int qdp_inner_rb1 = qdpsite_rb1 % qdp_inner_dim; + int rb1_offset = qdp_inner_rb1 + qdp_inner_dim*n_complex_dim*Nc*Nc*qdp_outer_rb1; + + u_cb1[block][2*mu][c][c2][RE][xx] = (FT)u_minus_ptr[rb1_offset + qdp_inner_dim*(RE+n_complex_dim*(c + Nc*c2)) ]; + u_cb1[block][2*mu][c][c2][IM][xx] = (FT)u_minus_ptr[rb1_offset + qdp_inner_dim*(IM+n_complex_dim*(c + Nc*c2)) ]; + u_cb1[block][2*mu+1][c][c2][RE][xx] = (FT)u_ptr[rb1_offset + qdp_inner_dim*(RE+n_complex_dim*(c + Nc*c2)) ]; + u_cb1[block][2*mu+1][c][c2][IM][xx] = (FT)u_ptr[rb1_offset + qdp_inner_dim*(IM+n_complex_dim*(c + Nc*c2)) ]; + + } + } + } + } + } + } + } + } + + + } + +#ifdef QPHIX_BUILD_CLOVER + + // This accesses the Internals of the LLVMCloverTerm + template + void qdp_pack_clover(const ClovTerm& qdp_clov_in, + typename ClovDslash::CloverBlock* cl_out,Geometry& s, int cb) + { + // Get the subgrid latt size. + int Nt = s.Nt(); + int Nz = s.Nz(); + int Ny = s.Ny(); + int nvecs = s.nVecs(); + int nyg = s.nGY(); + int Pxy = s.getPxy(); + int Pxyz = s.getPxyz(); + + // Sanity Check + // QDP Type is + // Outer x 2 chiral blocks x 6 floats x inner sites + // JIT-LLVM CLOVER will need to expose DiagType and OffDiagType + // It does in the testcase here but make sure chroma version does it too. + + using DiagType = typename ClovTerm::DiagType; + using OffDiagType = typename ClovTerm::OffDiagType; + + // This is the base type used here. Either double or float + using WT = typename WordType::Type_t; + + const DiagType& diag_term = qdp_clov_in.getDiagBuffer(); + const OffDiagType& off_diag_term = qdp_clov_in.getOffDiagBuffer(); + + const WT* diag_buf = (const WT*)diag_term.getFjit(); + const WT* off_diag_buf = (const WT*)off_diag_term.getFjit(); + + int qdp_inner_dim = (int)getDataLayoutInnerSize(); + const int n_complex_dim = 2; + const int diag_dim = 6; + const int offdiag_dim = 15; + const int chiral_dim = 2; + const int diag_offset = qdp_inner_dim*diag_dim*chiral_dim; + const int offdiag_offset = qdp_inner_dim*offdiag_dim*n_complex_dim*chiral_dim; + + // No elem calls in parallel region +#pragma omp parallel for collapse(4) + for(int t = 0; t < Nt; t++) { + for(int z = 0; z < Nz; z++) { + for(int y = 0; y < Ny; y++) { + for(int s = 0; s < nvecs; s++) { + for(int x = 0; x < soalen; x++) { + + int block = (t*Pxyz+z*Pxy)/nyg+(y/nyg)*nvecs+s; + int xx = (y%nyg)*soalen+x; + + int qdpsite = x + soalen*(s + nvecs*(y + Ny*(z + Nz*t)))+rb[cb].start(); + int qdp_outer_idx = qdpsite / qdp_inner_dim; + int qdp_inner_idx = qdpsite % qdp_inner_dim; + + const WT* diag_base = &diag_buf[diag_offset*qdp_outer_idx]; + const WT* offdiag_base = &off_diag_buf[offdiag_offset*qdp_outer_idx]; + + // WARNING: THIS WORKS ONLY IN OCSRI Layout in QDP_JIT + // + // For OSCRI Layout: chiral component and diagonal must be transposed. + // chiral component and off diagonal must also be transposed + + // Logical Outer x Spin x Color x Inner => Outer x Comp x Diag x Inner + // But we are using OCSRI => Outer Diag Comp Inner + + for(int d=0; d < 6; d++) { + cl_out[block].diag1[d][xx] = (FT)(diag_base[ qdp_inner_idx + qdp_inner_dim*(0 + chiral_dim*d ) ]); + } + + for(int od=0; od < 15; od++) { + + cl_out[block].off_diag1[od][RE][xx] = (FT)(offdiag_base[ qdp_inner_idx + + qdp_inner_dim*(RE + n_complex_dim*(0 + chiral_dim*od))]); + + cl_out[block].off_diag1[od][IM][xx] = (FT)(offdiag_base[ qdp_inner_idx + + qdp_inner_dim*(IM + n_complex_dim*(0 + chiral_dim*od))]); + + } + + for(int d=0; d < 6; d++) { + cl_out[block].diag2[d][xx] = (FT)(diag_base[ qdp_inner_idx + qdp_inner_dim*(1 + chiral_dim*d ) ]); + } + + for(int od=0; od < 15; od++) { + cl_out[block].off_diag2[od][RE][xx] = (FT)(offdiag_base[ qdp_inner_idx + + qdp_inner_dim*(RE + n_complex_dim*(1 + chiral_dim*od))]); + + cl_out[block].off_diag2[od][IM][xx] = (FT)(offdiag_base[ qdp_inner_idx + + qdp_inner_dim*(IM + n_complex_dim*(1 + chiral_dim*od))]); + + } + } + } + } + } + } + } + + + +#endif // IFDEF BUILD CLOVER + + + template + void qdp_pack_cb_spinor(const QDPSpinor& psi_in, + typename Geometry::FourSpinorBlock* psi, + Geometry& s, + int cb) + { + // Get the subgrid latt size. + int Nt = s.Nt(); + int Nz = s.Nz(); + int Ny = s.Ny(); + int Nxh = s.Nxh(); + int nvecs = s.nVecs(); + int Pxy = s.getPxy(); + int Pxyz = s.getPxyz(); + + int qdp_inner_dim = (int)getDataLayoutInnerSize(); + const int n_complex_dim = 2; + using WT =typename WordType::Type_t; + + const int outer_block_size = qdp_inner_dim*n_complex_dim*Nc*Ns; + +#pragma omp parallel for collapse(4) + for(int t=0; t < Nt; t++) { + for(int z=0; z < Nz; z++) { + for(int y=0; y < Ny; y++) { + for(int s=0; s < nvecs; s++) { + for(int col=0; col < 3; col++) { + for(int spin=0; spin < 4; spin++) { + for(int x=0; x < soalen; x++) { + + int ind = t*Pxyz+z*Pxy+y*nvecs+s; //((t*Nz+z)*Ny+y)*nvecs+s; + int x_coord = s*soalen + x; + + int qdp_index = ((t*Nz + z)*Ny + y)*Nxh + x_coord + rb[cb].start(); + int qdp_inner_idx = qdp_index % qdp_inner_dim; + int qdp_outer_idx = qdp_index / qdp_inner_dim; + + WT* psiptr=(WT *)(psi_in.getFjit()); + + // ASSUME OCSRI order. So the fixed points our + // qdp_outer*block-size + qdp_inner + // and the iteration is over the spinor as Inner x Complex x Spin x Color + + int offset = qdp_inner_idx + outer_block_size*qdp_outer_idx; + psi[ind][col][spin][0][x] = (FT)psiptr[offset+qdp_inner_dim*(RE + n_complex_dim*(spin + Ns*col)) ]; + + psi[ind][col][spin][1][x] = (FT)psiptr[offset+qdp_inner_dim*(IM + n_complex_dim*(spin + Ns*col)) ]; + } + } + } + } + } + } + } + + } + + + template + void qdp_unpack_cb_spinor(typename Geometry::FourSpinorBlock* chi_packed, + QDPSpinor& chi, + Geometry& s, + int cb) + { + int Nt = s.Nt(); + int Nz = s.Nz(); + int Ny = s.Ny(); + int Nxh = s.Nxh(); + int nvecs = s.nVecs(); + int Pxy = s.getPxy(); + int Pxyz = s.getPxyz(); + + int qdp_inner_dim = (int)getDataLayoutInnerSize(); + const int n_complex_dim = 2; + using WT =typename WordType::Type_t; + + const int outer_block_size = qdp_inner_dim*n_complex_dim*Nc*Ns; + + +#pragma omp parallel for collapse(4) + for(int t=0; t < Nt; t++) { + for(int z=0; z < Nz; z++) { + for(int y=0; y < Ny; y++) { + for(int s=0; s < nvecs; s++) { + for(int spin=0; spin < 4; spin++) { + for(int col=0; col < 3; col++) { + for(int x=0; x < soalen; x++) { + + int ind = t*Pxyz+z*Pxy+y*nvecs+s; //((t*Nz+z)*Ny+y)*nvecs+s; + int x_coord = s*soalen + x; + + int qdp_index = ((t*Nz + z)*Ny + y)*Nxh + x_coord + rb[cb].start(); + int qdp_inner_idx = qdp_index % qdp_inner_dim; + int qdp_outer_idx = qdp_index / qdp_inner_dim; + + WT* chiptr=(WT *)(chi.getFjit()); + // ASSUME OCSRI order. So the fixed points our + // qdp_outer*block-size + qdp_inner + // and the iteration is over the spinor as Inner x Complex x Spin x Color + + int offset = qdp_inner_idx + outer_block_size*qdp_outer_idx; + + chiptr[offset+qdp_inner_dim*(RE + n_complex_dim*(spin + Ns*col)) ] = (WT)chi_packed[ind][col][spin][0][x]; + chiptr[offset+qdp_inner_dim*(IM + n_complex_dim*(spin + Ns*col)) ] = (WT)chi_packed[ind][col][spin][1][x]; + + } + } + } + } + } + } + } + } + +}; + + +#endif diff --git a/tests/testClovDslashFull.cc b/tests/testClovDslashFull.cc index ac200351..ce0c7475 100644 --- a/tests/testClovDslashFull.cc +++ b/tests/testClovDslashFull.cc @@ -265,7 +265,7 @@ testClovDslashFull::runTest(void) QDPIO::cout << " clparam.clovCoeffR = " << clparam.clovCoeffR << std::endl; QDPIO::cout << " clparam.clovCoeffT = " << clparam.clovCoeffT << std::endl; clov_qdp.create(u, clparam); - clov_qdp.printDiag(); + // clov_qdp.printDiag(); QDPIO::cout << "Copying into InvClover Term" << endl; CloverTermT invclov_qdp(clov_qdp); @@ -404,6 +404,7 @@ testClovDslashFull::runTest(void) int target_cb = cb; clov_chi = zero; + chi = zero; qdp_pack_spinor<>(chi, chi_even, chi_odd, geom); @@ -475,7 +476,7 @@ testClovDslashFull::runTest(void) -#if 0 +#if 1 // Go through the test cases -- apply SSE dslash versus, QDP Dslash // Test ax - bDslash y QDPIO::cout << "Testing dslashAchiMinusBDPsi" << endl; @@ -538,7 +539,7 @@ testClovDslashFull::runTest(void) #endif -#if 0 +#if 1 // Test only Dslash operator. // For clover this will be: A^{-1}_(1-cb,1-cb) D_(1-cb, cb) psi_cb QDPIO::cout << "Testing Dslash With antiperiodic BCs \n" << endl; @@ -671,7 +672,7 @@ testClovDslashFull::runTest(void) -#if 0 +#if 1 // Go through the test cases -- apply SSE dslash versus, QDP Dslash // Test ax - bDslash y QDPIO::cout << "Testing dslashAchiMinusBDPsi" << endl; @@ -734,7 +735,7 @@ testClovDslashFull::runTest(void) #endif // Disabling testing the even odd operator until recoded with new vectorization -#if 0 +#if 1 QDPIO::cout << "Testing Even Odd Operator" << endl; t_boundary=(double)(-1); EvenOddCloverOperator M(u_packed, @@ -801,7 +802,7 @@ testClovDslashFull::runTest(void) #endif -#if 0 +#if 1 { chi = zero; qdp_pack_cb_spinor<>(chi, chi_s[1], geom,1); @@ -859,7 +860,7 @@ testClovDslashFull::runTest(void) } #endif -#if 0 +#if 1 { chi = zero; qdp_pack_spinor<>(chi, chi_even, chi_odd, geom);