From 73947871e8dcd11dc31af725dfa8cbb88527427e Mon Sep 17 00:00:00 2001
From: Balint Joo <bjoo@jlab.org>
Date: Fri, 24 Jun 2016 22:55:54 -0400
Subject: [PATCH] Initial integration with QDP-JIT  - packers are fully
 threaded and use raw QDPJIT pointer access.  - packers work only when QDP-JIT
 is run with -layout ocsri

Todo:
 - recheck integration has not broken non QDP-JIT build
 - bulletproof (with assertions) to ensure layout is ocsri, or adapta also to oscri
 - Use as 'native' dslash: only possible in ocsri mode since that is what is wired into QDPJIT
 - lots of testing.
        - test full chroma QDP-JIT build (e.g. Leapfrog)
---
 NEWS                                 |   1 +
 TODO                                 |   9 +-
 include/qphix/Makefile.am            |   2 +-
 include/qphix/qdp_packer.h           | 398 +--------------------------
 include/qphix/qdp_packer_parscalar.h | 355 +-----------------------
 include/qphix/qdp_packer_qdpjit.h    | 339 +++++++++++++++++++++++
 tests/testClovDslashFull.cc          |  15 +-
 7 files changed, 362 insertions(+), 757 deletions(-)
 create mode 100644 include/qphix/qdp_packer_qdpjit.h
diff --git a/NEWS b/NEWS
index 91dc768f..9a121876 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,4 @@
+6/24/2015 (Balint Joo/Thorsten Kurth) - Added lots of cleanup for KNL. Added packers for QDP-JIT (OCSRI layout for now)
 1/12/2016 (Balint Joo/Aaron Walden) - Added AVX512 output from code generator. Tested on Intel SDE. Single and Double precision work, 16-bit precision is currently broken.
 
 4/9/2015 (Balint Joo): Added Basic Multi-Shift solvers -- but can still add more optimizations. Need to test multi-node etc.
diff --git a/TODO b/TODO
index 9e560720..0cad6154 100644
--- a/TODO
+++ b/TODO
@@ -1,10 +1,13 @@
+To do with QDPJIT:
+  - Thorough testing for SOA combinations (initially done with 4)
+  - Bullet proofing assertions that data is in OCSRI layout or make the code generically figure out the right layout.
+  - Write native dslash using raw pointers: need OSCRI layout assertion and InnerLength assertion (easy)
+    - but issue w.r.t currently onle inner=8 from QDPJIT apparently working in my tests
+
 TO Do:
 - Add cache blocking in X -- hooks are already present in the kernels, but need to adapt loops.
-- Add other solvers specially multi-shift
 - Solve N-systems at once (ongoing PhD project at Old Dominion University Computer Science Department: ODU-JLab collaboration)
 - Clean code for non ICC compilers (work ongoing by Diptorup Deb, Renaissance Computing Institute, University of North Carolina, Chapel Hill
-- Add other processor targets: SSE, AVX2, AVX512 - codegen supports these already but need to be added
-- Add other fermions: e.g. Twisted Mass, HiSQ, Domain Wall?
 - Bullet proof a little (A LOT!!!)
 - Better tuning (implement 1/2/3/4 threads per core instead of just all 4)
 - reduce verbosite and use master printf to better effect (log levels?)
diff --git a/include/qphix/Makefile.am b/include/qphix/Makefile.am
index 296b7ad4..c64432f9 100644
--- a/include/qphix/Makefile.am
+++ b/include/qphix/Makefile.am
@@ -39,7 +39,7 @@ qphix_include_HEADERS += \
 #
 
 if QPHIX_BUILD_WITH_QDP
-qphix_include_HEADERS += ./qdp_packer.h
+qphix_include_HEADERS += ./qdp_packer.h ./qdp_packer_parscalar.h qdp_packer_qdpjit.h
 endif
 
 
diff --git a/include/qphix/qdp_packer.h b/include/qphix/qdp_packer.h
index 4dfc9ee7..47615404 100644
--- a/include/qphix/qdp_packer.h
+++ b/include/qphix/qdp_packer.h
@@ -9,12 +9,10 @@
 #include "qphix/dslash_def.h"
 #include "qphix/qphix_config.h"
 
-#if defined(QPHIX_MIC_SOURCE) || defined(QPHIX_AVX512_SOURCE)
-#include <immintrin.h>
-#endif
-
-#ifdef QPHIX_BUILD_CLOVER
-#include "qphix/clover_dslash_def.h"
+#ifdef QPHIX_BUILD_QDPJIT
+#include "qphix/qdp_packer_qdpjit.h"
+#else
+#include "qphix/qdp_packer_parscalar.h"
 #endif
 
 using namespace QDP;
@@ -22,259 +20,6 @@ using namespace QDP;
 namespace QPhiX { 
 
 
-  template<typename FT, int veclen, int soalen, bool compress, typename QDPGauge>
-    void qdp_pack_gauge(const QDPGauge& u,  
-      typename Geometry<FT,veclen,soalen,compress>::SU3MatrixBlock *u_cb0, 
-      typename Geometry<FT,veclen,soalen,compress>::SU3MatrixBlock *u_cb1, 
-      Geometry<FT,veclen,soalen,compress>& s)
-  {
-    // Get the subgrid latt size.
-    int Nt = s.Nt();
-    int Nz = s.Nz();
-    int Ny = s.Ny();
-    int nvecs = s.nVecs();
-    int nyg = s.nGY();
-    int Pxy = s.getPxy();
-    int Pxyz = s.getPxyz();
-    
-
-    // Shift the lattice to get U(x-mu)
-    QDPGauge u_minus(4);
-    for(int mu=0; mu < 4; mu++) {
-      u_minus[mu] = shift(u[mu], BACKWARD, mu);
-    }
-
-    std::cout << "QDP Shift is complete. Entering OpenMP Pack Region" << std::endl;
-
-// No Elem calls in parallel regions
-// #pragma omp parallel for collapse(4)
-		for(int t = 0; t < Nt; t++) {
-			for(int z = 0; z < Nz; z++) {
-				for(int y = 0; y < Ny; y++) {
-					for(int s = 0; s < nvecs; s++) {
-						for(int mu = 0; mu < 4; mu++) {
-							int outer_c = 3;
-							if ( compress ) {
-								outer_c = 2;
-							}
-							for(int c = 0; c < outer_c; c++) {
-								for(int c2 = 0; c2 < 3; c2++) {
-									for(int x = 0; x < soalen; x++) {
-
-										//#ifndef USE_PACKED_GAUGES
-										//int xx = x;
-										//int block = ((t*Nz+z)*Ny+y)*nvecs+s;
-
-										//#endif
-										//#else // USE_PACKED_GAUGES
-										int block = (t*Pxyz+z*Pxy)/nyg+(y/nyg)*nvecs+s;
-										int xx = (y%nyg)*soalen+x;
-										// #endif // USE_PACKED_GAUGES
-
-										int qdpsite = x + soalen*(s + nvecs*(y + Ny*(z + Nz*t)));
-										u_cb0[block][2*mu][c][c2][0][xx] = u_minus[mu].elem(rb[0].start() + qdpsite).elem().elem(c2,c).real();
-										u_cb0[block][2*mu][c][c2][1][xx] = u_minus[mu].elem(rb[0].start() + qdpsite).elem().elem(c2,c).imag();
-										u_cb0[block][2*mu+1][c][c2][0][xx] = u[mu].elem(rb[0].start() + qdpsite).elem().elem(c2,c).real();
-										u_cb0[block][2*mu+1][c][c2][1][xx] = u[mu].elem(rb[0].start() + qdpsite).elem().elem(c2,c).imag();
-
-										u_cb1[block][2*mu][c][c2][0][xx] = u_minus[mu].elem(rb[1].start() + qdpsite).elem().elem(c2,c).real();
-										u_cb1[block][2*mu][c][c2][1][xx] = u_minus[mu].elem(rb[1].start() + qdpsite).elem().elem(c2,c).imag();
-										u_cb1[block][2*mu+1][c][c2][0][xx] = u[mu].elem(rb[1].start() + qdpsite).elem().elem(c2,c).real();
-										u_cb1[block][2*mu+1][c][c2][1][xx] = u[mu].elem(rb[1].start() + qdpsite).elem().elem(c2,c).imag();
-									}
-								}
-							}
-						}
-					}
-				}
-			}
-		}
-
-		QDPIO::cout << "Leaving Gauge PAck" << std::endl;
-	}
-
-#ifdef QPHIX_BUILD_CLOVER
-
-#ifdef QPHIX_BUILD_QDPJIT
-  // extern int64_t getDataLayoutInnerSize();
-
-  // This accesses the Internals of the LLVMCloverTerm
-  template<typename FT, int veclen, int soalen, bool compress, typename ClovTerm>
-     void qdp_pack_clover(const ClovTerm& qdp_clov_in,
-        typename ClovDslash<FT,veclen,soalen,compress>::CloverBlock* cl_out,Geometry<FT,veclen,soalen,compress>& s, int cb)
-   {
-     // Get the subgrid latt size.
-     int Nt = s.Nt();
-     int Nz = s.Nz();
-     int Ny = s.Ny();
-     int nvecs = s.nVecs();
-     int nyg = s.nGY();
-     int Pxy = s.getPxy();
-     int Pxyz = s.getPxyz();
-
-     // Sanity Check
-     // QDP Type is
-     // Outer x 2 chiral blocks x 6 floats x inner sites
-     const typename ClovTerm::DiagType& diag_buf = qdp_clov_in.getDiagBuffer();
-     const typename ClovTerm::OffDiagType& off_diag_buf = qdp_clov_in.getOffDiagBuffer();
-
-   //  const int qdp_inner_size =(int)getDataLayoutInnerSize();
-   //  const int num_comp = 2;
-   //  const int num_complex = 2;
-
-
-
-
- // No elem calls in parallel region
- //#pragma omp parallel for collapse(4)
-     for(int t = 0; t < Nt; t++) {
-       for(int z = 0; z < Nz; z++) {
-   for(int y = 0; y < Ny; y++) {
-     for(int s = 0; s < nvecs; s++) {
-       for(int x = 0; x < soalen; x++) {
-
-         int block = (t*Pxyz+z*Pxy)/nyg+(y/nyg)*nvecs+s;
-         int xx = (y%nyg)*soalen+x;
-
-         int qdpsite = x + soalen*(s + nvecs*(y + Ny*(z + Nz*t)))+rb[cb].start();
-        // int qdp_osite = qdpsite/qdp_inner_size;
-        // int qdp_isite = qdpsite%qdp_inner_size;
-
-
-         for(int d=0; d < 6; d++) {
-        	cl_out[block].diag1[d][xx] = (FT)diag_buf.elem(qdpsite).comp[0].diag[d].elem().elem();
-        //  cl_out[block].diag1[d][xx] = (FT)1/(FT)4.1;
-         }
-         for(int od=0; od < 15; od++) {
-        	cl_out[block].off_diag1[od][RE][xx] = off_diag_buf.elem(qdpsite).comp[0].offd[od].real().elem();
-        	cl_out[block].off_diag1[od][IM][xx] = off_diag_buf.elem(qdpsite).comp[0].offd[od].imag().elem();
-
-        	// cl_out[block].off_diag1[od][RE][xx] = 0;
-            //  cl_out[block].off_diag1[od][IM][xx] = 0;
-
-
-         }
-
-         for(int d=0; d < 6; d++) {
-        	 cl_out[block].diag2[d][xx] = diag_buf.elem(qdpsite).comp[1].diag[d].elem().elem();
-     //   	 cl_out[block].diag2[d][xx] = (FT)1/(FT)4.1;
-         }
-         for(int od=0; od < 15; od++) {
-
-         	 cl_out[block].off_diag2[od][RE][xx] = off_diag_buf.elem(qdpsite).comp[1].offd[od].real().elem();
-         	 cl_out[block].off_diag2[od][IM][xx] = off_diag_buf.elem(qdpsite).comp[1].offd[od].imag().elem();
-
-       //       cl_out[block].off_diag2[od][RE][xx] = 0;
-       //       cl_out[block].off_diag2[od][IM][xx] = 0;
-
-
-         }
-       }
-     }
-   }
-       }
-     }
-   }
-
-
-#else
-
-  template<typename FT, int veclen, int soalen, bool compress, typename ClovTerm>
-    void qdp_pack_clover(const ClovTerm& clov_in,
-       typename ClovDslash<FT,veclen,soalen,compress>::CloverBlock* cl_out,Geometry<FT,veclen,soalen,compress>& s, int cb)
-  {
-    // Get the subgrid latt size.
-    int Nt = s.Nt();
-    int Nz = s.Nz();
-    int Ny = s.Ny();
-    int nvecs = s.nVecs();
-    int nyg = s.nGY();
-    int Pxy = s.getPxy();
-    int Pxyz = s.getPxyz();
-    
-    auto qdp_clov_in = clov_in.getTriBuffer();
-
-
-#pragma omp parallel for collapse(4)
-    for(int t = 0; t < Nt; t++) {
-      for(int z = 0; z < Nz; z++) {
-  for(int y = 0; y < Ny; y++) {
-    for(int s = 0; s < nvecs; s++) {
-      for(int x = 0; x < soalen; x++) {
-        
-        int block = (t*Pxyz+z*Pxy)/nyg+(y/nyg)*nvecs+s;
-        int xx = (y%nyg)*soalen+x;
-        int qdpsite = x + soalen*(s + nvecs*(y + Ny*(z + Nz*t)))+rb[cb].start();
-        
-        for(int d=0; d < 6; d++) { 
-        	cl_out[block].diag1[d][xx]=qdp_clov_in[qdpsite].diag[0][d].elem();
-        }
-        for(int od=0; od < 15; od++) { 
-        	cl_out[block].off_diag1[od][RE][xx]=qdp_clov_in[qdpsite].offd[0][od].real();
-        	cl_out[block].off_diag1[od][IM][xx]=qdp_clov_in[qdpsite].offd[0][od].imag();
-        }
-
-        for(int d=0; d < 6; d++) { 
-        	cl_out[block].diag2[d][xx]=qdp_clov_in[qdpsite].diag[1][d].elem();
-
-        }
-        for(int od=0; od < 15; od++) { 
-        	cl_out[block].off_diag2[od][RE][xx]=qdp_clov_in[qdpsite].offd[1][od].real();
-        	cl_out[block].off_diag2[od][IM][xx]=qdp_clov_in[qdpsite].offd[1][od].imag();
-        }
-      }
-    }
-  }
-      }
-    }
-  }
-  
-#endif
-#endif  // IFDEF BUILD CLOVER
-
-
-  template<typename FT, int veclen, int soalen, bool compress, typename QDPSpinor>
-  void qdp_pack_cb_spinor(const QDPSpinor& psi_in, 
-        typename Geometry<FT,veclen,soalen, compress>::FourSpinorBlock* psi,
-        Geometry<FT,veclen,soalen,compress>& s,
-        int cb) 
-  { 
-    // Get the subgrid latt size.
-    int Nt = s.Nt();
-    int Nz = s.Nz();
-    int Ny = s.Ny();
-    int Nxh = s.Nxh();
-    int nvecs = s.nVecs();
-    int Pxy = s.getPxy();
-    int Pxyz = s.getPxyz();
-
-    // No elem in OpenMP parallel region
-//#pragma omp parallel for collapse(4)
-      for(int t=0; t < Nt; t++) {
-  for(int z=0; z < Nz; z++) {
-    for(int y=0; y < Ny; y++) {
-      for(int s=0; s < nvecs; s++) { 
-        for(int col=0; col < 3; col++)  {
-    for(int spin=0; spin < 4; spin++) { 
-      for(int x=0; x < soalen; x++) { 
-
-        int ind = t*Pxyz+z*Pxy+y*nvecs+s; //((t*Nz+z)*Ny+y)*nvecs+s;
-        int x_coord = s*soalen + x;
-        
-        int qdp_ind = ((t*Nz + z)*Ny + y)*Nxh + x_coord;
-        psi[ind][col][spin][0][x] = psi_in.elem(rb[cb].start()+qdp_ind).elem(spin).elem(col).real();
-        psi[ind][col][spin][1][x] = psi_in.elem(rb[cb].start()+qdp_ind).elem(spin).elem(col).imag();
-      
-      }
-    }
-        }
-      }
-    }
-  }
-      }
-      
-  }
-
   template<typename FT, int veclen, int soalen, bool compress, typename QDPSpinor>
   void qdp_pack_spinor(const QDPSpinor& psi_in, 
            typename Geometry<FT,veclen,soalen, compress>::FourSpinorBlock* psi_even, 
@@ -285,94 +30,6 @@ namespace QPhiX {
     qdp_pack_cb_spinor(psi_in,psi_odd,s,1);
   }
 
-#if 0
-  template<typename FT, int veclen, int soalen, bool compress, typename QDPSpinor>
-  void qdp_pack_spinor(const QDPSpinor& psi_in, 
-           typename Geometry<FT,veclen,soalen, compress>::FourSpinorBlock* psi_even, 
-           typename Geometry<FT,veclen,soalen, compress>::FourSpinorBlock* psi_odd,
-           Geometry<FT,veclen,soalen,compress>& s) 
-  { 
-    // Get the subgrid latt size.
-    int Nt = s.Nt();
-    int Nz = s.Nz();
-    int Ny = s.Ny();
-    int Nxh = s.Nxh();
-    int nvecs = s.nVecs();
-    int Pxy = s.getPxy();
-    int Pxyz = s.getPxyz();
-
-#pragma omp parallel for collapse(4)
-      for(int t=0; t < Nt; t++) {
-  for(int z=0; z < Nz; z++) {
-    for(int y=0; y < Ny; y++) {
-      for(int s=0; s < nvecs; s++) { 
-        for(int col=0; col < 3; col++)  {
-    for(int spin=0; spin < 4; spin++) { 
-      for(int x=0; x < soalen; x++) { 
-
-        int ind = t*Pxyz+z*Pxy+y*nvecs+s; //((t*Nz+z)*Ny+y)*nvecs+s;
-        int x_coord = s*soalen + x;
-        
-        int qdp_ind = ((t*Nz + z)*Ny + y)*Nxh + x_coord;
-        psi_even[ind][col][spin][0][x] = psi_in.elem(rb[0].start()+qdp_ind).elem(spin).elem(col).real();
-        psi_even[ind][col][spin][1][x] = psi_in.elem(rb[0].start()+qdp_ind).elem(spin).elem(col).imag();
-        
-        psi_odd[ind][col][spin][0][x] = psi_in.elem(rb[1].start()+qdp_ind).elem(spin).elem(col).real();
-        psi_odd[ind][col][spin][1][x] = psi_in.elem(rb[1].start()+qdp_ind).elem(spin).elem(col).imag();
-      
-      }
-    }
-        }
-      }
-    }
-  }
-      }
-      
-  }
-#endif
-  
-
-  template<typename FT, int veclen, int soalen, bool compress, typename QDPSpinor>
-    void qdp_unpack_cb_spinor(typename Geometry<FT,veclen,soalen,compress>::FourSpinorBlock* chi_packed, 
-            QDPSpinor& chi,
-            Geometry<FT,veclen,soalen,compress>& s,
-            int cb) 
-  { 
-    int Nt = s.Nt();
-    int Nz = s.Nz();
-    int Ny = s.Ny();
-    int Nxh = s.Nxh();
-    int nvecs = s.nVecs();
-    int Pxy = s.getPxy();
-    int Pxyz = s.getPxyz();
-
-
-    // No elem() in OpenMP parallel
-//#pragma omp parallel for collapse(4)
-    for(int t=0; t < Nt; t++) {
-      for(int z=0; z < Nz; z++) {
-  for(int y=0; y < Ny; y++) {
-    for(int s=0; s < nvecs; s++) { 
-      for(int spin=0; spin < 4; spin++) { 
-        for(int col=0; col < 3; col++)  {
-    for(int x=0; x < soalen; x++) { 
-
-      int ind = t*Pxyz+z*Pxy+y*nvecs+s; //((t*Nz+z)*Ny+y)*nvecs+s;
-      int x_coord = s*soalen + x;
-      
-      int qdp_ind = ((t*Nz + z)*Ny + y)*Nxh + x_coord;
-      
-      chi.elem(rb[cb].start()+qdp_ind).elem(spin).elem(col).real() =  chi_packed[ind][col][spin][0][x];
-      chi.elem(rb[cb].start()+qdp_ind).elem(spin).elem(col).imag() =  chi_packed[ind][col][spin][1][x];
-
-    }
-        }
-      }
-    }
-  }
-      }
-    }
-  }
 
   template<typename FT, int veclen, int soalen, bool compress, typename QDPSpinor>
     void qdp_unpack_spinor(typename Geometry<FT,veclen,soalen,compress>::FourSpinorBlock* chi_even, 
@@ -384,53 +41,6 @@ namespace QPhiX {
     qdp_unpack_cb_spinor(chi_odd,chi,s,1);
   }
 
-#if 0
-  template<typename FT, int veclen, int soalen, bool compress, typename QDPSpinor>
-    void qdp_unpack_spinor(typename Geometry<FT,veclen,soalen,compress>::FourSpinorBlock* chi_even, 
-         typename Geometry<FT,veclen,soalen,compress>::FourSpinorBlock* chi_odd,
-         QDPSpinor& chi,
-         Geometry<FT,veclen,soalen,compress>& s) 
-  { 
-
-    int Nt = s.Nt();
-    int Nz = s.Nz();
-    int Ny = s.Ny();
-    int Nxh = s.Nxh();
-    int nvecs = s.nVecs();
-    int Pxy = s.getPxy();
-    int Pxyz = s.getPxyz();
-
-
-#pragma omp parallel for collapse(4)    
-    for(int t=0; t < Nt; t++) {
-      for(int z=0; z < Nz; z++) {
-  for(int y=0; y < Ny; y++) {
-    for(int s=0; s < nvecs; s++) { 
-      for(int spin=0; spin < 4; spin++) { 
-        for(int col=0; col < 3; col++)  {
-    for(int x=0; x < soalen; x++) { 
-
-      int ind = t*Pxyz+z*Pxy+y*nvecs+s; //((t*Nz+z)*Ny+y)*nvecs+s;
-      int x_coord = s*soalen + x;
-      
-      int qdp_ind = ((t*Nz + z)*Ny + y)*Nxh + x_coord;
-      
-      chi.elem(rb[0].start()+qdp_ind).elem(spin).elem(col).real() =  chi_even[ind][col][spin][0][x];
-      chi.elem(rb[0].start()+qdp_ind).elem(spin).elem(col).imag() =  chi_even[ind][col][spin][1][x];
-
-      chi.elem(rb[1].start()+qdp_ind).elem(spin).elem(col).real() =  chi_odd[ind][col][spin][0][x];
-      chi.elem(rb[1].start()+qdp_ind).elem(spin).elem(col).imag() =  chi_odd[ind][col][spin][1][x];
-
-    }
-        }
-      }
-    }
-  }
-      }
-    }
-  }
-#endif
-
 #if defined(QPHIX_MIC_SOURCE) || defined(QPHIX_AVX512_SOURCE)
 
   // Downconvert an array of float-vecs to an array of float 16 vecs
diff --git a/include/qphix/qdp_packer_parscalar.h b/include/qphix/qdp_packer_parscalar.h
index caab9e1f..c53b5794 100644
--- a/include/qphix/qdp_packer_parscalar.h
+++ b/include/qphix/qdp_packer_parscalar.h
@@ -1,7 +1,7 @@
-#ifndef QPHIX_QDP_PACKER_H
-#define QPHIX_QDP_PACKER_H
-
+#ifndef QPHIX_QDP_PACKER_PARSCALAR_H
+#define QPHIX_QDP_PACKER_PARSCALAR_H
 
+#warning "building with parscalar packers"
 
 #include "qdp.h"
 #include "qphix/geometry.h"
@@ -181,62 +181,6 @@ namespace QPhiX {
       
   }
 
-  template<typename FT, int veclen, int soalen, bool compress, typename QDPSpinor>
-  void qdp_pack_spinor(const QDPSpinor& psi_in, 
-           typename Geometry<FT,veclen,soalen, compress>::FourSpinorBlock* psi_even, 
-           typename Geometry<FT,veclen,soalen, compress>::FourSpinorBlock* psi_odd,
-           Geometry<FT,veclen,soalen,compress>& s) 
-  {
-    qdp_pack_cb_spinor(psi_in,psi_even,s,0);
-    qdp_pack_cb_spinor(psi_in,psi_odd,s,1);
-  }
-
-#if 0
-  template<typename FT, int veclen, int soalen, bool compress, typename QDPSpinor>
-  void qdp_pack_spinor(const QDPSpinor& psi_in, 
-           typename Geometry<FT,veclen,soalen, compress>::FourSpinorBlock* psi_even, 
-           typename Geometry<FT,veclen,soalen, compress>::FourSpinorBlock* psi_odd,
-           Geometry<FT,veclen,soalen,compress>& s) 
-  { 
-    // Get the subgrid latt size.
-    int Nt = s.Nt();
-    int Nz = s.Nz();
-    int Ny = s.Ny();
-    int Nxh = s.Nxh();
-    int nvecs = s.nVecs();
-    int Pxy = s.getPxy();
-    int Pxyz = s.getPxyz();
-
-#pragma omp parallel for collapse(4)
-      for(int t=0; t < Nt; t++) {
-  for(int z=0; z < Nz; z++) {
-    for(int y=0; y < Ny; y++) {
-      for(int s=0; s < nvecs; s++) { 
-        for(int col=0; col < 3; col++)  {
-    for(int spin=0; spin < 4; spin++) { 
-      for(int x=0; x < soalen; x++) { 
-
-        int ind = t*Pxyz+z*Pxy+y*nvecs+s; //((t*Nz+z)*Ny+y)*nvecs+s;
-        int x_coord = s*soalen + x;
-        
-        int qdp_ind = ((t*Nz + z)*Ny + y)*Nxh + x_coord;
-        psi_even[ind][col][spin][0][x] = psi_in.elem(rb[0].start()+qdp_ind).elem(spin).elem(col).real();
-        psi_even[ind][col][spin][1][x] = psi_in.elem(rb[0].start()+qdp_ind).elem(spin).elem(col).imag();
-        
-        psi_odd[ind][col][spin][0][x] = psi_in.elem(rb[1].start()+qdp_ind).elem(spin).elem(col).real();
-        psi_odd[ind][col][spin][1][x] = psi_in.elem(rb[1].start()+qdp_ind).elem(spin).elem(col).imag();
-      
-      }
-    }
-        }
-      }
-    }
-  }
-      }
-      
-  }
-#endif
-  
 
   template<typename FT, int veclen, int soalen, bool compress, typename QDPSpinor>
     void qdp_unpack_cb_spinor(typename Geometry<FT,veclen,soalen,compress>::FourSpinorBlock* chi_packed, 
@@ -279,299 +223,6 @@ namespace QPhiX {
     }
   }
 
-  template<typename FT, int veclen, int soalen, bool compress, typename QDPSpinor>
-    void qdp_unpack_spinor(typename Geometry<FT,veclen,soalen,compress>::FourSpinorBlock* chi_even, 
-         typename Geometry<FT,veclen,soalen,compress>::FourSpinorBlock* chi_odd,
-         QDPSpinor& chi,
-         Geometry<FT,veclen,soalen,compress>& s) 
-  {
-    qdp_unpack_cb_spinor(chi_even,chi,s,0);
-    qdp_unpack_cb_spinor(chi_odd,chi,s,1);
-  }
-
-#if 0
-  template<typename FT, int veclen, int soalen, bool compress, typename QDPSpinor>
-    void qdp_unpack_spinor(typename Geometry<FT,veclen,soalen,compress>::FourSpinorBlock* chi_even, 
-         typename Geometry<FT,veclen,soalen,compress>::FourSpinorBlock* chi_odd,
-         QDPSpinor& chi,
-         Geometry<FT,veclen,soalen,compress>& s) 
-  { 
-
-    int Nt = s.Nt();
-    int Nz = s.Nz();
-    int Ny = s.Ny();
-    int Nxh = s.Nxh();
-    int nvecs = s.nVecs();
-    int Pxy = s.getPxy();
-    int Pxyz = s.getPxyz();
-
-
-#pragma omp parallel for collapse(4)    
-    for(int t=0; t < Nt; t++) {
-      for(int z=0; z < Nz; z++) {
-  for(int y=0; y < Ny; y++) {
-    for(int s=0; s < nvecs; s++) { 
-      for(int spin=0; spin < 4; spin++) { 
-        for(int col=0; col < 3; col++)  {
-    for(int x=0; x < soalen; x++) { 
-
-      int ind = t*Pxyz+z*Pxy+y*nvecs+s; //((t*Nz+z)*Ny+y)*nvecs+s;
-      int x_coord = s*soalen + x;
-      
-      int qdp_ind = ((t*Nz + z)*Ny + y)*Nxh + x_coord;
-      
-      chi.elem(rb[0].start()+qdp_ind).elem(spin).elem(col).real() =  chi_even[ind][col][spin][0][x];
-      chi.elem(rb[0].start()+qdp_ind).elem(spin).elem(col).imag() =  chi_even[ind][col][spin][1][x];
-
-      chi.elem(rb[1].start()+qdp_ind).elem(spin).elem(col).real() =  chi_odd[ind][col][spin][0][x];
-      chi.elem(rb[1].start()+qdp_ind).elem(spin).elem(col).imag() =  chi_odd[ind][col][spin][1][x];
-
-    }
-        }
-      }
-    }
-  }
-      }
-    }
-  }
-#endif
-
-#if defined(QPHIX_MIC_SOURCE) || defined(QPHIX_AVX512_SOURCE)
-
-  // Downconvert an array of float-vecs to an array of float 16 vecs
-  void downconvert_array(const float *from, half *to, const unsigned int nvecs)
-  {
-#pragma omp parallel for shared(from,to)
-    for(int i=0; i < nvecs; i++) { 
-      __m512 in = _mm512_load_ps((void *)(from+16*i));
-  #if defined (QPHIX_MIC_SOURCE)
-      _mm512_extstore_ps((void *)(to+16*i),in,_MM_DOWNCONV_PS_FLOAT16,_MM_HINT_NT);
-  #elif defined(QPHIX_AVX512_SOURCE)
-    _mm256_store_si256((__m256i *)(to+16*i), _mm512_cvtps_ph(in, _MM_FROUND_TO_NEAREST_INT) );
-  #endif
-    }
-  }
-
-  // Upconvert an array of float16 vecs to an array of float 32
-  void upconvert_array(const half* from, float *to, const unsigned int nvecs)
-  {
-#pragma omp parallel for shared(from, to)
-    for(int i=0; i < nvecs; i++) { 
-  #if defined (QPHIX_MIC_SOURCE)
-      __m512 in = _mm512_extload_ps((void *)(from+16*i),_MM_UPCONV_PS_FLOAT16, _MM_BROADCAST32_NONE, _MM_HINT_T0);
-      _mm512_storenrngo_ps((void *)(to+16*i),in);
-  #elif defined(QPHIX_AVX512_SOURCE)
-    __m512 in = _mm512_cvtph_ps(_mm256_load_si256((__m256i*)(from+16*i) ) );
-    _mm512_stream_ps((void *)(to+16*i),in);
-  #endif
-    }
-  }
-
-
-  // Half precision packers... 
- template<int soalen, bool compress, typename QDPGauge>
-   void qdp_pack_gauge(const QDPGauge& u,  
-           typename Geometry<half,16,soalen,compress>::SU3MatrixBlock *u_cb0, 
-           typename Geometry<half,16,soalen,compress>::SU3MatrixBlock *u_cb1, 
-           Geometry<half,16,soalen,compress>& s)
- {
-   
-   typedef typename Geometry<float,16,soalen,compress>::SU3MatrixBlock GaugeF;
-   
-   int latt_size[4];
-   int By,Bz,NCores,Sy,Sz,PadXY,PatXYZ,MinCt;
-   latt_size[0]=s.Nx();
-   latt_size[1]=s.Ny();
-   latt_size[2]=s.Nz();
-   latt_size[3]=s.Nt();
-   // Make a geometry for allocating
-   Geometry<float,16,soalen,compress> g_float(latt_size,
-                s.getBy(),
-                s.getBz(),
-                s.getNumCores(),
-                s.getSy(),
-                s.getSz(),
-                s.getPadXY(),
-                s.getPadXYZ(),
-                s.getMinCt());
-              
-   GaugeF* tmp_cb0 = (GaugeF *)g_float.allocCBGauge();
-   GaugeF* tmp_cb1 = (GaugeF *)g_float.allocCBGauge();
-   // CHECK THESE ALLOCATIONS
-
-   // OK Now pack the float
-   qdp_pack_gauge<float,16,soalen,compress,QDPGauge>(u, tmp_cb0, tmp_cb1, g_float);
-
-   // This is copied out of the allocation routine. 
-   // Basically we take all that we have allocated
-   // and divide by veclen*sizeof(float) to get the number 
-   // of vectors to downconvert
-   unsigned int n_f_vecs = (((s.getPxyz()*s.Nt()*soalen)/16)*sizeof(GaugeF))/(16*sizeof(float));
-
-   downconvert_array((float *)tmp_cb0, (half *)u_cb0, n_f_vecs);
-   downconvert_array((float *)tmp_cb1, (half *)u_cb1, n_f_vecs);
-
-   g_float.free(tmp_cb0);
-   g_float.free(tmp_cb1);
-
- }
-
- template<int soalen, bool compress, typename QDPSpinor>
-   void qdp_pack_spinor(const QDPSpinor& psi_in, 
-      typename Geometry<half,16,soalen, compress>::FourSpinorBlock* psi_even, 
-      typename Geometry<half,16,soalen, compress>::FourSpinorBlock* psi_odd,
-      Geometry<half,16,soalen,compress>& s) 
-   {
-     typedef typename Geometry<float,16,soalen,compress>::FourSpinorBlock SpinorF;
-
-     int latt_size[4];
-     int By,Bz,NCores,Sy,Sz,PadXY,PatXYZ,MinCt;
-     latt_size[0]=s.Nx();
-     latt_size[1]=s.Ny();
-     latt_size[2]=s.Nz();
-     latt_size[3]=s.Nt();
-     // Make a geometry for allocating
-     Geometry<float,16,soalen,compress> g_float(latt_size,
-            s.getBy(),
-            s.getBz(),
-            s.getNumCores(),
-            s.getSy(),
-            s.getSz(),
-            s.getPadXY(),
-            s.getPadXYZ(),
-            s.getMinCt());
-     
-     
-     SpinorF* tmp_cb0_alloc = (SpinorF *)g_float.allocCBFourSpinor();
-     SpinorF* tmp_cb1_alloc = (SpinorF *)g_float.allocCBFourSpinor();
-
-     SpinorF* tmp_cb0 = tmp_cb0_alloc;
-     SpinorF* tmp_cb1 = tmp_cb1_alloc;
-  
-     // CHECK THESE ALLOCATIONS
-     
-     // OK Now pack the float
-     qdp_pack_spinor<float,16,soalen,compress,QDPSpinor>(psi_in, tmp_cb0, tmp_cb1, g_float);
- 
-     // This is copied out of the allocation routine. 
-     // Basically we take all that we have allocated
-     // and divide by veclen*sizeof(float) to get the number 
-     // of vectors to downconvert
- 
-     unsigned int n_f_vecs = ((s.getPxyz()*s.Nt())*sizeof(SpinorF))/(16*sizeof(float));
-
-     downconvert_array((float *)tmp_cb0, (half *)psi_even, n_f_vecs);
-     downconvert_array((float *)tmp_cb1, (half *)psi_odd,  n_f_vecs);
-
-     g_float.free(tmp_cb0_alloc);
-     g_float.free(tmp_cb1_alloc);
-
-   }
- 
- template<int soalen, bool compress, typename QDPSpinor>
-   void qdp_unpack_spinor(typename Geometry<half,16,soalen,compress>::FourSpinorBlock* chi_even, 
-        typename Geometry<half,16,soalen,compress>::FourSpinorBlock* chi_odd,
-        QDPSpinor& chi,
-        Geometry<half,16,soalen,compress>& s) 
-   {
-     
-     typedef typename Geometry<float,16,soalen,compress>::FourSpinorBlock SpinorF;
-     
-     int latt_size[4];
-     int By,Bz,NCores,Sy,Sz,PadXY,PatXYZ,MinCt;
-     latt_size[0]=s.Nx();
-     latt_size[1]=s.Ny();
-     latt_size[2]=s.Nz();
-     latt_size[3]=s.Nt();
-     // Make a geometry for allocating
-     Geometry<float,16,soalen,compress> g_float(latt_size,
-            s.getBy(),
-            s.getBz(),
-            s.getNumCores(),
-            s.getSy(),
-            s.getSz(),
-            s.getPadXY(),
-            s.getPadXYZ(),
-            s.getMinCt());
-     
-     
-     // FIXME: CHECK THESE ALLOCATIONS
-     SpinorF* tmp_cb0_alloc = (SpinorF *)g_float.allocCBFourSpinor();
-     SpinorF* tmp_cb1_alloc = (SpinorF *)g_float.allocCBFourSpinor();
-
-     SpinorF* tmp_cb0 = tmp_cb0_alloc;
-     SpinorF* tmp_cb1 = tmp_cb1_alloc;
-
-     // This is copied out of the allocation routine. 
-     // Basically we take all that we have allocated
-     // and divide by veclen*sizeof(float) to get the number 
-     // of vectors to downconvert
- 
-     unsigned int n_f_vecs = ((s.getPxyz()*s.Nt())*sizeof(SpinorF))/(16*sizeof(float));
-     
-     upconvert_array((half *)chi_even, (float *)tmp_cb0, n_f_vecs);
-     upconvert_array((half *)chi_odd, (float *)tmp_cb1, n_f_vecs);
-     
-     // OK Now pack the float
-     qdp_unpack_spinor<float,16,soalen,compress,QDPSpinor>(tmp_cb0, tmp_cb1,chi, g_float);
-     
-//std::cout << "spinor up free in" << std::endl;
-     g_float.free(tmp_cb0_alloc);
-     g_float.free(tmp_cb1_alloc);
-//std::cout << "spinor up free out" << std::endl;
-
-   }
-
-#ifdef QPHIX_BUILD_CLOVER
-  template<int soalen, bool compress, typename ClovTerm>
-    void qdp_pack_clover(const ClovTerm& qdp_clov_in,
-       typename Geometry<half,16,soalen,compress>::CloverBlock* cl_out,
-       Geometry<half,16,soalen,compress>& s, int cb)
-  {
-     typedef typename Geometry<float,16,soalen,compress>::CloverBlock ClovF;
-     
-     int latt_size[4];
-     int By,Bz,NCores,Sy,Sz,PadXY,PatXYZ,MinCt;
-     latt_size[0]=s.Nx();
-     latt_size[1]=s.Ny();
-     latt_size[2]=s.Nz();
-     latt_size[3]=s.Nt();
-     // Make a geometry for allocating
-     Geometry<float,16,soalen,compress> g_float(latt_size,
-            s.getBy(),
-            s.getBz(),
-            s.getNumCores(),
-            s.getSy(),
-            s.getSz(),
-            s.getPadXY(),
-            s.getPadXYZ(),
-            s.getMinCt());
-        
-     
-     
-     
-     ClovF* tmp_clov = (ClovF *)g_float.allocCBClov();
-
-     // This is copied out of the allocation routine. 
-     // Basically we take all that we have allocated
-     // and divide by veclen*sizeof(float) to get the number 
-     // of vectors to downconvert
-
-     unsigned int n_f_vecs = (( (s.getPxyz()*s.Nt()*soalen) / 16  )*sizeof(ClovF))/(16*sizeof(float));
-    
-
-     // OK Now pack the float
-     qdp_pack_clover<float,16,soalen,compress,ClovTerm>(qdp_clov_in, tmp_clov, g_float, cb);
-     
-     downconvert_array((float *)tmp_clov,(half *)cl_out,n_f_vecs);
-//std::cout << "clov down free in" << std::endl;
-     g_float.free(tmp_clov);
-//std::cout << "clov down free out" << std::endl;
-  }
-#endif // Build Clover
-
-#endif  // if defined(QPHIX_MIC_SOURCE) || defined(QPHIX_AVX512_SOURCE)
-
 };
 
 
diff --git a/include/qphix/qdp_packer_qdpjit.h b/include/qphix/qdp_packer_qdpjit.h
new file mode 100644
index 00000000..3a812b7c
--- /dev/null
+++ b/include/qphix/qdp_packer_qdpjit.h
@@ -0,0 +1,339 @@
+#ifndef QPHIX_QDP_PACKER_QDPJIT_H
+#define QPHIX_QDP_PACKER_QDPJIT_H
+
+#warning "building with QDPJIT packers"
+
+
+#include "qdp.h"
+#include "qphix/geometry.h"
+
+#include "qphix/dslash_def.h"
+#include "qphix/qphix_config.h"
+
+#if defined(QPHIX_MIC_SOURCE) || defined(QPHIX_AVX512_SOURCE)
+#include <immintrin.h>
+#endif
+
+#ifdef QPHIX_BUILD_CLOVER
+#include "qphix/clover_dslash_def.h"
+#endif
+
+using namespace QDP;
+
+namespace QPhiX { 
+
+
+  template<typename FT, int veclen, int soalen, bool compress, typename QDPGauge>
+    void qdp_pack_gauge(const QDPGauge& u,  
+      typename Geometry<FT,veclen,soalen,compress>::SU3MatrixBlock *u_cb0, 
+      typename Geometry<FT,veclen,soalen,compress>::SU3MatrixBlock *u_cb1, 
+      Geometry<FT,veclen,soalen,compress>& s)
+  {
+    // Get the subgrid latt size.
+    int Nt = s.Nt();
+    int Nz = s.Nz();
+    int Ny = s.Ny();
+    int nvecs = s.nVecs();
+    int nyg = s.nGY();
+    int Pxy = s.getPxy();
+    int Pxyz = s.getPxyz();
+	int qdp_inner_dim = (int)getDataLayoutInnerSize();
+	const int n_complex_dim = 2;
+
+    // Shift the lattice to get U(x-mu)
+    QDPGauge u_minus(4);
+    for(int mu=0; mu < 4; mu++) {
+      u_minus[mu] = shift(u[mu], BACKWARD, mu);
+    }
+
+    // This ought to be the underlying precision type (float/double) in use
+    // by QDP-JIT and may well be different from FT
+    // So casting will be needed
+    using WT = typename WordType<QDPGauge>::Type_t;
+
+#pragma omp parallel for collapse(4)
+		for(int t = 0; t < Nt; t++) {
+			for(int z = 0; z < Nz; z++) {
+				for(int y = 0; y < Ny; y++) {
+					for(int s = 0; s < nvecs; s++) {
+						for(int mu = 0; mu < 4; mu++) {
+
+						    const WT* u_ptr = (const WT*)((u[mu]).getFjit());
+							const WT* u_minus_ptr = (const WT*)((u_minus[mu]).getFjit());
+
+							int outer_c = 3;
+							if ( compress ) {
+								outer_c = 2;
+							}
+							for(int c = 0; c < outer_c; c++) {
+								for(int c2 = 0; c2 < 3; c2++) {
+									for(int x = 0; x < soalen; x++) {
+
+
+										int block = (t*Pxyz+z*Pxy)/nyg+(y/nyg)*nvecs+s;
+										int xx = (y%nyg)*soalen+x;
+
+										// QDP-JIT in OCSRI layout:
+										//  Outer x Color x Spin x Real x Inner
+										//
+										// However, it hardly matters maybe because
+										// There is no spin on the gauge fields
+
+										int qdpsite = x + soalen*(s + nvecs*(y + Ny*(z + Nz*t)));
+
+										int qdpsite_rb0 = qdpsite+rb[0].start();
+										int qdp_outer_rb0 = qdpsite_rb0 / qdp_inner_dim;
+										int qdp_inner_rb0 = qdpsite_rb0 % qdp_inner_dim;
+										int rb0_offset = qdp_inner_rb0 + qdp_inner_dim*n_complex_dim*Nc*Nc*qdp_outer_rb0;
+
+										u_cb0[block][2*mu][c][c2][RE][xx] = (FT)u_minus_ptr[rb0_offset + qdp_inner_dim*(RE+n_complex_dim*(c + Nc*c2)) ];
+										u_cb0[block][2*mu][c][c2][IM][xx] = (FT)u_minus_ptr[rb0_offset + qdp_inner_dim*(IM+n_complex_dim*(c + Nc*c2)) ];
+										u_cb0[block][2*mu+1][c][c2][RE][xx] = (FT)u_ptr[rb0_offset + qdp_inner_dim*(RE+n_complex_dim*(c + Nc*c2)) ];
+										u_cb0[block][2*mu+1][c][c2][IM][xx] = (FT)u_ptr[rb0_offset + qdp_inner_dim*(IM+n_complex_dim*(c + Nc*c2)) ];
+
+										int qdpsite_rb1 = qdpsite+rb[1].start();
+										int qdp_outer_rb1 = qdpsite_rb1 / qdp_inner_dim;
+										int qdp_inner_rb1 = qdpsite_rb1 % qdp_inner_dim;
+										int rb1_offset = qdp_inner_rb1 + qdp_inner_dim*n_complex_dim*Nc*Nc*qdp_outer_rb1;
+
+										u_cb1[block][2*mu][c][c2][RE][xx] = (FT)u_minus_ptr[rb1_offset + qdp_inner_dim*(RE+n_complex_dim*(c + Nc*c2)) ];
+										u_cb1[block][2*mu][c][c2][IM][xx] = (FT)u_minus_ptr[rb1_offset + qdp_inner_dim*(IM+n_complex_dim*(c + Nc*c2)) ];
+										u_cb1[block][2*mu+1][c][c2][RE][xx] = (FT)u_ptr[rb1_offset + qdp_inner_dim*(RE+n_complex_dim*(c + Nc*c2)) ];
+										u_cb1[block][2*mu+1][c][c2][IM][xx] = (FT)u_ptr[rb1_offset + qdp_inner_dim*(IM+n_complex_dim*(c + Nc*c2)) ];
+
+									}
+								}
+							}
+						}
+					}
+				}
+			}
+		}
+
+
+	}
+
+#ifdef QPHIX_BUILD_CLOVER
+
+	// This accesses the Internals of the LLVMCloverTerm
+	template<typename FT, int veclen, int soalen, bool compress, typename ClovTerm>
+	void qdp_pack_clover(const ClovTerm& qdp_clov_in,
+			typename ClovDslash<FT,veclen,soalen,compress>::CloverBlock* cl_out,Geometry<FT,veclen,soalen,compress>& s, int cb)
+	{
+		// Get the subgrid latt size.
+		int Nt = s.Nt();
+		int Nz = s.Nz();
+		int Ny = s.Ny();
+		int nvecs = s.nVecs();
+		int nyg = s.nGY();
+		int Pxy = s.getPxy();
+		int Pxyz = s.getPxyz();
+
+		// Sanity Check
+		// QDP Type is
+		// Outer x 2 chiral blocks x 6 floats x inner sites
+		// JIT-LLVM CLOVER will need to expose DiagType and OffDiagType
+		// It does in the testcase here but make sure chroma version does it too.
+
+		using DiagType = typename ClovTerm::DiagType;
+		using OffDiagType = typename ClovTerm::OffDiagType;
+
+		// This is the base type used here. Either double or float
+		using WT = typename WordType<DiagType>::Type_t;
+
+		const DiagType& diag_term = qdp_clov_in.getDiagBuffer();
+		const OffDiagType& off_diag_term = qdp_clov_in.getOffDiagBuffer();
+
+		const WT* diag_buf = (const WT*)diag_term.getFjit();
+		const WT* off_diag_buf = (const WT*)off_diag_term.getFjit();
+
+		int qdp_inner_dim = (int)getDataLayoutInnerSize();
+		const int n_complex_dim = 2;
+		const int diag_dim = 6;
+		const int offdiag_dim = 15;
+		const int chiral_dim = 2;
+		const int diag_offset = qdp_inner_dim*diag_dim*chiral_dim;
+		const int offdiag_offset = qdp_inner_dim*offdiag_dim*n_complex_dim*chiral_dim;
+
+		// No elem calls in parallel region
+#pragma omp parallel for collapse(4)
+		for(int t = 0; t < Nt; t++) {
+			for(int z = 0; z < Nz; z++) {
+				for(int y = 0; y < Ny; y++) {
+					for(int s = 0; s < nvecs; s++) {
+						for(int x = 0; x < soalen; x++) {
+
+							int block = (t*Pxyz+z*Pxy)/nyg+(y/nyg)*nvecs+s;
+							int xx = (y%nyg)*soalen+x;
+
+							int qdpsite = x + soalen*(s + nvecs*(y + Ny*(z + Nz*t)))+rb[cb].start();
+							int qdp_outer_idx = qdpsite / qdp_inner_dim;
+							int qdp_inner_idx = qdpsite % qdp_inner_dim;
+
+							const WT* diag_base = &diag_buf[diag_offset*qdp_outer_idx];
+							const WT* offdiag_base = &off_diag_buf[offdiag_offset*qdp_outer_idx];
+
+							// WARNING: THIS WORKS ONLY IN OCSRI Layout in QDP_JIT
+							//
+							// For OSCRI Layout: chiral component and diagonal must be transposed.
+							//                   chiral component and off diagonal must also be transposed
+
+							// Logical Outer x Spin x Color x Inner => Outer x Comp x Diag x Inner
+							// But we are using OCSRI => Outer Diag Comp Inner
+
+							for(int d=0; d < 6; d++) {
+								cl_out[block].diag1[d][xx] = (FT)(diag_base[ qdp_inner_idx + qdp_inner_dim*(0 + chiral_dim*d ) ]);
+							}
+
+							for(int od=0; od < 15; od++) {
+
+								cl_out[block].off_diag1[od][RE][xx] = (FT)(offdiag_base[ qdp_inner_idx +
+										qdp_inner_dim*(RE + n_complex_dim*(0 + chiral_dim*od))]);
+
+								cl_out[block].off_diag1[od][IM][xx] = (FT)(offdiag_base[ qdp_inner_idx +
+										qdp_inner_dim*(IM + n_complex_dim*(0 + chiral_dim*od))]);
+
+							}
+
+							for(int d=0; d < 6; d++) {
+								cl_out[block].diag2[d][xx] = (FT)(diag_base[ qdp_inner_idx + qdp_inner_dim*(1 + chiral_dim*d ) ]);
+							}
+
+							for(int od=0; od < 15; od++) {
+								cl_out[block].off_diag2[od][RE][xx] = (FT)(offdiag_base[ qdp_inner_idx +
+										qdp_inner_dim*(RE + n_complex_dim*(1 + chiral_dim*od))]);
+
+								cl_out[block].off_diag2[od][IM][xx] = (FT)(offdiag_base[ qdp_inner_idx +
+										qdp_inner_dim*(IM + n_complex_dim*(1 + chiral_dim*od))]);
+
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+
+
+
+#endif  // IFDEF BUILD CLOVER
+
+
+	template<typename FT, int veclen, int soalen, bool compress, typename QDPSpinor>
+	void qdp_pack_cb_spinor(const QDPSpinor& psi_in,
+			typename Geometry<FT,veclen,soalen, compress>::FourSpinorBlock* psi,
+			Geometry<FT,veclen,soalen,compress>& s,
+			int cb)
+	{
+		// Get the subgrid latt size.
+		int Nt = s.Nt();
+		int Nz = s.Nz();
+		int Ny = s.Ny();
+		int Nxh = s.Nxh();
+		int nvecs = s.nVecs();
+		int Pxy = s.getPxy();
+		int Pxyz = s.getPxyz();
+
+		int qdp_inner_dim = (int)getDataLayoutInnerSize();
+		const int n_complex_dim = 2;
+		using WT =typename WordType<QDPSpinor>::Type_t;
+
+		const int outer_block_size = qdp_inner_dim*n_complex_dim*Nc*Ns;
+
+#pragma omp parallel for collapse(4)
+		for(int t=0; t < Nt; t++) {
+			for(int z=0; z < Nz; z++) {
+				for(int y=0; y < Ny; y++) {
+					for(int s=0; s < nvecs; s++) {
+						for(int col=0; col < 3; col++) {
+							for(int spin=0; spin < 4; spin++) {
+								for(int x=0; x < soalen; x++) {
+
+									int ind = t*Pxyz+z*Pxy+y*nvecs+s; //((t*Nz+z)*Ny+y)*nvecs+s;
+									int x_coord = s*soalen + x;
+
+									int qdp_index = ((t*Nz + z)*Ny + y)*Nxh + x_coord + rb[cb].start();
+									int qdp_inner_idx = qdp_index % qdp_inner_dim;
+									int qdp_outer_idx = qdp_index / qdp_inner_dim;
+
+									WT* psiptr=(WT *)(psi_in.getFjit());
+
+									// ASSUME OCSRI order. So the fixed points our
+									// qdp_outer*block-size + qdp_inner
+									// and the iteration is over the spinor as Inner x Complex x Spin x Color
+
+									int offset =  qdp_inner_idx + outer_block_size*qdp_outer_idx;
+									psi[ind][col][spin][0][x] = (FT)psiptr[offset+qdp_inner_dim*(RE + n_complex_dim*(spin + Ns*col)) ];
+
+									psi[ind][col][spin][1][x] = (FT)psiptr[offset+qdp_inner_dim*(IM + n_complex_dim*(spin + Ns*col)) ];
+								}
+							}
+						}
+					}
+				}
+			}
+		}
+
+	}
+
+
+  template<typename FT, int veclen, int soalen, bool compress, typename QDPSpinor>
+    void qdp_unpack_cb_spinor(typename Geometry<FT,veclen,soalen,compress>::FourSpinorBlock* chi_packed, 
+            QDPSpinor& chi,
+            Geometry<FT,veclen,soalen,compress>& s,
+            int cb) 
+  { 
+    int Nt = s.Nt();
+    int Nz = s.Nz();
+    int Ny = s.Ny();
+    int Nxh = s.Nxh();
+    int nvecs = s.nVecs();
+    int Pxy = s.getPxy();
+    int Pxyz = s.getPxyz();
+
+    int qdp_inner_dim = (int)getDataLayoutInnerSize();
+    const int n_complex_dim = 2;
+    using WT =typename WordType<QDPSpinor>::Type_t;
+
+    const int outer_block_size = qdp_inner_dim*n_complex_dim*Nc*Ns;
+
+
+#pragma omp parallel for collapse(4)
+		for(int t=0; t < Nt; t++) {
+			for(int z=0; z < Nz; z++) {
+				for(int y=0; y < Ny; y++) {
+					for(int s=0; s < nvecs; s++) {
+						for(int spin=0; spin < 4; spin++) {
+							for(int col=0; col < 3; col++) {
+								for(int x=0; x < soalen; x++) {
+
+									int ind = t*Pxyz+z*Pxy+y*nvecs+s; //((t*Nz+z)*Ny+y)*nvecs+s;
+									int x_coord = s*soalen + x;
+
+									int qdp_index = ((t*Nz + z)*Ny + y)*Nxh + x_coord + rb[cb].start();
+									int qdp_inner_idx = qdp_index % qdp_inner_dim;
+									int qdp_outer_idx = qdp_index / qdp_inner_dim;
+
+									WT* chiptr=(WT *)(chi.getFjit());
+									// ASSUME OCSRI order. So the fixed points our
+									// qdp_outer*block-size + qdp_inner
+									// and the iteration is over the spinor as Inner x Complex x Spin x Color
+
+									int offset = qdp_inner_idx + outer_block_size*qdp_outer_idx;
+
+									chiptr[offset+qdp_inner_dim*(RE + n_complex_dim*(spin + Ns*col)) ] = (WT)chi_packed[ind][col][spin][0][x];
+									chiptr[offset+qdp_inner_dim*(IM + n_complex_dim*(spin + Ns*col)) ] = (WT)chi_packed[ind][col][spin][1][x];
+
+								}
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+
+};
+
+
+#endif
diff --git a/tests/testClovDslashFull.cc b/tests/testClovDslashFull.cc
index ac200351..ce0c7475 100644
--- a/tests/testClovDslashFull.cc
+++ b/tests/testClovDslashFull.cc
@@ -265,7 +265,7 @@ testClovDslashFull::runTest(void)
   QDPIO::cout << " clparam.clovCoeffR = " << clparam.clovCoeffR << std::endl;
   QDPIO::cout << " clparam.clovCoeffT = " << clparam.clovCoeffT << std::endl;
   clov_qdp.create(u, clparam);
-  clov_qdp.printDiag();
+ // clov_qdp.printDiag();
 
   QDPIO::cout << "Copying into InvClover Term" << endl;
   CloverTermT<Phi, U> invclov_qdp(clov_qdp);
@@ -404,6 +404,7 @@ testClovDslashFull::runTest(void)
       int target_cb = cb;
 
       clov_chi = zero;
+      chi = zero;
       qdp_pack_spinor<>(chi, chi_even, chi_odd, geom);
 
 
@@ -475,7 +476,7 @@ testClovDslashFull::runTest(void)
 
 
 
-#if 0
+#if 1
   // Go through the test cases -- apply SSE dslash versus, QDP Dslash 
   // Test ax - bDslash y
   QDPIO::cout << "Testing dslashAchiMinusBDPsi" << endl;
@@ -538,7 +539,7 @@ testClovDslashFull::runTest(void)
 #endif
 
 
-#if 0
+#if 1
   // Test only Dslash operator.
   // For clover this will be: A^{-1}_(1-cb,1-cb) D_(1-cb, cb)  psi_cb
   QDPIO::cout << "Testing Dslash With antiperiodic BCs \n" << endl;
@@ -671,7 +672,7 @@ testClovDslashFull::runTest(void)
 
 
 
-#if 0
+#if 1
   // Go through the test cases -- apply SSE dslash versus, QDP Dslash 
   // Test ax - bDslash y
   QDPIO::cout << "Testing dslashAchiMinusBDPsi" << endl;
@@ -734,7 +735,7 @@ testClovDslashFull::runTest(void)
 #endif
 
   // Disabling testing the even odd operator until recoded with new vectorization
-#if 0
+#if 1
   QDPIO::cout << "Testing Even Odd Operator" << endl;
   t_boundary=(double)(-1);
   EvenOddCloverOperator<FT,V,S,compress> M(u_packed,  
@@ -801,7 +802,7 @@ testClovDslashFull::runTest(void)
 
 #endif
 
-#if 0
+#if 1
   {
     chi = zero;
     qdp_pack_cb_spinor<>(chi, chi_s[1], geom,1);
@@ -859,7 +860,7 @@ testClovDslashFull::runTest(void)
   }
 #endif
 
-#if 0
+#if 1
   {
     chi = zero;
     qdp_pack_spinor<>(chi, chi_even, chi_odd, geom);