nvc+MPI issues with non-grid devito dimensions #2277

Closed
@deckerla

Description

Consider the following MFE, which simulates wave propagation, projects the wavefield onto a frequency basis, and then extracts the projected values at receiver locations (a short reference sketch of the projection step follows the script):

import devito
import numpy as np
shape = (51,51,51)
nfreq = 5
nrec = 6
space_order = 8
dt = 0.01
time_M = 200

grid = devito.Grid(shape=shape, extent=(1,1,1))
freq_dim = devito.DefaultDimension(name="freq_dim", default_value=nfreq)
f = devito.Function(name="frequencies", dimensions=(freq_dim,), shape=(nfreq,), grid=grid)
f.data[:] = np.arange(1,nfreq+1)
p0   = devito.TimeFunction(name="p0", grid=grid, time_order=2, space_order=space_order)
p0re = devito.Function(name="p0re", grid=grid, dimensions=(*grid.dimensions, freq_dim), shape=(*shape, nfreq), space_order=space_order)

src = devito.SparseTimeFunction(name="src", grid=grid, nt=time_M, npoint=1, coordinates=np.array([0.5,0.5,0.5]))

# Subclass placing the sparse (receiver) dimension first in the dimensions tuple
class CoordSlowSparseFunction(devito.SparseFunction):
    _sparse_position = 0

recdim = devito.Dimension(name="recdim")
p0recre = CoordSlowSparseFunction(name="p0recre", grid=grid, dimensions=(recdim, freq_dim), shape=(nrec, nfreq), npoint=nrec, coordinates=np.array([0.4,0.4,0.4]))

wp_eq = devito.Eq(p0.forward, -5/1000000*(p0.dx2 +  p0.dy2 + p0.dz2) + p0.backward)
inj_eq = src.inject(field=p0.forward, expr=devito.sin(2*3.14159*f*grid.time_dim*dt)/100000)
proj_eq_re = devito.Eq(p0re, p0re + devito.sin(2*3.14159*f*grid.time_dim*dt)*p0)
wp_proj_op = devito.Operator([wp_eq, inj_eq, proj_eq_re], name="propagator")

wp_proj_op.apply(time_M=time_M)

interp_eq_re = p0recre.interpolate(expr=p0re)

interp_op = devito.Operator([interp_eq_re], name="interpolator")

interp_op.apply()
print(p0recre.data)
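
For reference, the projection equation accumulates, at every grid point and for every frequency, the product sin(2*pi*f*time*dt)*p0 over the time loop, and the second operator then reads the accumulated field at the receiver coordinates. Below is a minimal NumPy sketch of one step of that accumulation; the array names are hypothetical and np.pi stands in for the literal 3.14159 used in the MFE:

import numpy as np

nfreq, dt = 5, 0.01
freqs = np.arange(1, nfreq + 1)            # same values as f.data above
p0_now = np.zeros((51, 51, 51))            # hypothetical wavefield snapshot at step `time`
p0re_ref = np.zeros((51, 51, 51, nfreq))   # accumulated frequency projection

time = 10                                  # example time-step index
p0re_ref += np.sin(2 * np.pi * freqs * time * dt) * p0_now[..., None]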

Running this script without MPI, we observe the following behavior:

cvx@cbox-lukedecker-extfwimpi:~$ DEVITO_LOGGING=DEBUG python mfe.py
Allocating host memory for frequencies(5,) [20 B]
Allocating host memory for src_coords(1, 3) [12 B]
Allocating host memory for p0recre_coords(6, 3) [72 B]
Operator `propagator` generated in 0.88 s
  * lowering.Clusters: 0.39 s (44.5 %)
     * specializing.Clusters: 0.24 s (27.4 %)
  * lowering.IET: 0.23 s (26.3 %)
  * lowering.Expressions: 0.22 s (25.2 %)
Flops reduction after symbolic optimization: [152 --> 54]
Allocating host memory for p0(3, 67, 67, 67) [3 MB]
Allocating host memory for p0re(67, 67, 67, 5) [6 MB]
Operator `propagator` fetched `/tmp/devito-jitcache-uid1000/94c579b990e202a56a5c59b81fe83ce007d2aff6.cpp` in 0.11 s from jit-cache
Operator `propagator` ran in 0.10 s
Global performance: [OI=0.55, 14.07 GFlops/s, 0.27 GPts/s]
Local performance:
  * multipass0<> ran in 0.09 s [OI=0.55, 16.70 GFlops/s, 0.32 GPts/s]
    + section0 ran in 0.03 s [24.62%] 
    + section1 ran in 0.07 s [75.36%] 
  * section2<<200,1,5,2,2,2>,<200,1,5,2,2,2>> ran in 0.01 s [OI=12.48, 0.13 GFlops/s, 0.01 GPts/s]
Performance[mode=advanced] arguments: {'nthreads': 48, 'nthreads_nonaffine': 48, 'x0_blk0_size': 8, 'y0_blk0_size': 8}
Operator `interpolator` generated in 0.30 s
  * lowering.Clusters: 0.16 s (53.4 %)
     * specializing.Clusters: 0.12 s (40.1 %)
        * cire: 0.08 s (26.7 %)
  * lowering.IET: 0.09 s (30.1 %)
     * specializing.IET: 0.07 s (23.4 %)
Flops reduction after symbolic optimization: [36 --> 21]
Allocating host memory for p0recre(6, 5) [120 B]
Operator `interpolator` fetched `/tmp/devito-jitcache-uid1000/3129fe9dfcd34d4a03a2a7eaab3ade05ff9aa4be.cpp` in 0.02 s from jit-cache
Operator `interpolator` ran in 0.01 s
Global performance: [OI=3.05, 0.01 GFlops/s]
Local performance:
  * section0<<6,5>,<6,5,2,2,2>> ran in 0.01 s [OI=3.05, 0.01 GFlops/s, 0.00 GPts/s]
Performance[mode=advanced] arguments: {'nthreads_nonaffine': 48}
[[70.95639 96.57933 94.03625 83.82484 73.3117 ]
 [70.95639 96.57933 94.03625 83.82484 73.3117 ]
 [70.95639 96.57933 94.03625 83.82484 73.3117 ]
 [70.95639 96.57933 94.03625 83.82484 73.3117 ]
 [70.95639 96.57933 94.03625 83.82484 73.3117 ]
 [70.95639 96.57933 94.03625 83.82484 73.3117 ]]

Here is the generated C code for the propagator:

#define _POSIX_C_SOURCE 200809L
#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);
#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;
#define MIN(a,b) (((a) < (b)) ? (a) : (b))

#include "stdlib.h"
#include "math.h"
#include "sys/time.h"
#include "xmmintrin.h"
#include "pmmintrin.h"
#include "omp.h"

struct dataobj
{
  void *restrict data;
  unsigned long * size;
  unsigned long * npsize;
  unsigned long * dsize;
  int * hsize;
  int * hofs;
  int * oofs;
  void * dmap;
} ;

struct profiler
{
  double multipass0;
  double section2;
  double section0;
  double section1;
} ;

extern "C" int propagator(struct dataobj *restrict frequencies_vec, struct dataobj *restrict p0_vec, struct dataobj *restrict p0re_vec, struct dataobj *restrict src_coords_vec, const int x_M, const int x_m, const int y_M, const int y_m, const int z_M, const int z_m, const int freq_dim_M, const int freq_dim_m, const float h_x, const float h_y, const float h_z, const float o_x, const float o_y, const float o_z, const int p_src_M, const int p_src_m, const int time_M, const int time_m, const int x0_blk0_size, const int y0_blk0_size, const int nthreads, const int nthreads_nonaffine, struct profiler * timers);


int propagator(struct dataobj *restrict frequencies_vec, struct dataobj *restrict p0_vec, struct dataobj *restrict p0re_vec, struct dataobj *restrict src_coords_vec, const int x_M, const int x_m, const int y_M, const int y_m, const int z_M, const int z_m, const int freq_dim_M, const int freq_dim_m, const float h_x, const float h_y, const float h_z, const float o_x, const float o_y, const float o_z, const int p_src_M, const int p_src_m, const int time_M, const int time_m, const int x0_blk0_size, const int y0_blk0_size, const int nthreads, const int nthreads_nonaffine, struct profiler * timers)
{
  float *restrict r1_vec __attribute__ ((aligned (64)));
  posix_memalign((void**)(&r1_vec),64,5*sizeof(float));

  float (*restrict frequencies) __attribute__ ((aligned (64))) = (float (*)) frequencies_vec->data;
  float (*restrict p0)[p0_vec->size[1]][p0_vec->size[2]][p0_vec->size[3]] __attribute__ ((aligned (64))) = (float (*)[p0_vec->size[1]][p0_vec->size[2]][p0_vec->size[3]]) p0_vec->data;
  float (*restrict p0re)[p0re_vec->size[1]][p0re_vec->size[2]][p0re_vec->size[3]] __attribute__ ((aligned (64))) = (float (*)[p0re_vec->size[1]][p0re_vec->size[2]][p0re_vec->size[3]]) p0re_vec->data;
  float (*restrict r1) __attribute__ ((aligned (64))) = (float (*)) r1_vec;
  float (*restrict src_coords)[src_coords_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[src_coords_vec->size[1]]) src_coords_vec->data;

  /* Flush denormal numbers to zero in hardware */
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);

  float r2 = 1.0F/(h_x*h_x);
  float r3 = 1.0F/(h_y*h_y);
  float r4 = 1.0F/(h_z*h_z);

  for (int time = time_m, t0 = (time)%(3), t1 = (time + 2)%(3), t2 = (time + 1)%(3); time <= time_M; time += 1, t0 = (time)%(3), t1 = (time + 2)%(3), t2 = (time + 1)%(3))
  {
    START(multipass0)
    START(section0)
    #pragma omp parallel num_threads(nthreads)
    {
      #pragma omp for schedule(static,1)
      for (int freq_dim = freq_dim_m; freq_dim <= freq_dim_M; freq_dim += 1)
      {
        r1[freq_dim] = sin(6.28318e-2F*time*frequencies[freq_dim]);
      }
    }
    STOP(section0,timers)

    START(section1)
    #pragma omp parallel num_threads(nthreads)
    {
      #pragma omp for collapse(2) schedule(dynamic,1)
      for (int x0_blk0 = x_m; x0_blk0 <= x_M; x0_blk0 += x0_blk0_size)
      {
        for (int y0_blk0 = y_m; y0_blk0 <= y_M; y0_blk0 += y0_blk0_size)
        {
          for (int x = x0_blk0; x <= MIN(x_M, x0_blk0 + x0_blk0_size - 1); x += 1)
          {
            for (int y = y0_blk0; y <= MIN(y_M, y0_blk0 + y0_blk0_size - 1); y += 1)
            {
              for (int z = z_m; z <= z_M; z += 1)
              {
                float r5 = 1.42361111100763e-5F*p0[t0][x + 8][y + 8][z + 8];
                p0[t2][x + 8][y + 8][z + 8] = r2*(r5 + 8.9285714284415e-9F*(p0[t0][x + 4][y + 8][z + 8] + p0[t0][x + 12][y + 8][z + 8]) + (-1.26984126982279e-7F)*(p0[t0][x + 5][y + 8][z + 8] + p0[t0][x + 11][y + 8][z + 8]) + 1.00000000005821e-6F*(p0[t0][x + 6][y + 8][z + 8] + p0[t0][x + 10][y + 8][z + 8]) + (-8.00000000046566e-6F)*(p0[t0][x + 7][y + 8][z + 8] + p0[t0][x + 9][y + 8][z + 8])) + r3*(r5 + 8.9285714284415e-9F*(p0[t0][x + 8][y + 4][z + 8] + p0[t0][x + 8][y + 12][z + 8]) + (-1.26984126982279e-7F)*(p0[t0][x + 8][y + 5][z + 8] + p0[t0][x + 8][y + 11][z + 8]) + 1.00000000005821e-6F*(p0[t0][x + 8][y + 6][z + 8] + p0[t0][x + 8][y + 10][z + 8]) + (-8.00000000046566e-6F)*(p0[t0][x + 8][y + 7][z + 8] + p0[t0][x + 8][y + 9][z + 8])) + r4*(r5 + 8.9285714284415e-9F*(p0[t0][x + 8][y + 8][z + 4] + p0[t0][x + 8][y + 8][z + 12]) + (-1.26984126982279e-7F)*(p0[t0][x + 8][y + 8][z + 5] + p0[t0][x + 8][y + 8][z + 11]) + 1.00000000005821e-6F*(p0[t0][x + 8][y + 8][z + 6] + p0[t0][x + 8][y + 8][z + 10]) + (-8.00000000046566e-6F)*(p0[t0][x + 8][y + 8][z + 7] + p0[t0][x + 8][y + 8][z + 9])) + p0[t1][x + 8][y + 8][z + 8];

                #pragma omp simd aligned(p0,p0re:64)
                for (int freq_dim = freq_dim_m; freq_dim <= freq_dim_M; freq_dim += 1)
                {
                  p0re[x + 8][y + 8][z + 8][freq_dim] = r1[freq_dim]*p0[t0][x + 8][y + 8][z + 8] + p0re[x + 8][y + 8][z + 8][freq_dim];
                }
              }
            }
          }
        }
      }
    }
    STOP(section1,timers)
    STOP(multipass0,timers)

    START(section2)
    #pragma omp parallel num_threads(nthreads_nonaffine)
    {
      int chunk_size = (int)(fmax(1, (1.0F/3.0F)*(p_src_M - p_src_m + 1)/nthreads_nonaffine));
      #pragma omp for schedule(dynamic,chunk_size)
      for (int p_src = p_src_m; p_src <= p_src_M; p_src += 1)
      {
        for (int freq_dim = freq_dim_m; freq_dim <= freq_dim_M; freq_dim += 1)
        {
          for (int rsrcx = 0; rsrcx <= 1; rsrcx += 1)
          {
            for (int rsrcy = 0; rsrcy <= 1; rsrcy += 1)
            {
              for (int rsrcz = 0; rsrcz <= 1; rsrcz += 1)
              {
                int posx = (int)(floor((-o_x + src_coords[p_src][0])/h_x));
                int posy = (int)(floor((-o_y + src_coords[p_src][1])/h_y));
                int posz = (int)(floor((-o_z + src_coords[p_src][2])/h_z));
                float px = -floor((-o_x + src_coords[p_src][0])/h_x) + (-o_x + src_coords[p_src][0])/h_x;
                float py = -floor((-o_y + src_coords[p_src][1])/h_y) + (-o_y + src_coords[p_src][1])/h_y;
                float pz = -floor((-o_z + src_coords[p_src][2])/h_z) + (-o_z + src_coords[p_src][2])/h_z;
                if (rsrcx + posx >= x_m - 1 && rsrcy + posy >= y_m - 1 && rsrcz + posz >= z_m - 1 && rsrcx + posx <= x_M + 1 && rsrcy + posy <= y_M + 1 && rsrcz + posz <= z_M + 1)
                {
                  float r0 = (1.0F/100000.0F)*(rsrcx*px + (1 - rsrcx)*(1 - px))*(rsrcy*py + (1 - rsrcy)*(1 - py))*(rsrcz*pz + (1 - rsrcz)*(1 - pz))*sin(6.28318e-2F*time*frequencies[freq_dim]);
                  #pragma omp atomic update
                  p0[t2][rsrcx + posx + 8][rsrcy + posy + 8][rsrcz + posz + 8] += r0;
                }
              }
            }
          }
        }
      }
    }
    STOP(section2,timers)
  }

  free(r1_vec);

  return 0;
}

And the generated C code for the interpolator:

#define _POSIX_C_SOURCE 200809L
#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);
#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;

#include "stdlib.h"
#include "math.h"
#include "sys/time.h"
#include "xmmintrin.h"
#include "pmmintrin.h"
#include "omp.h"

struct dataobj
{
  void *restrict data;
  unsigned long * size;
  unsigned long * npsize;
  unsigned long * dsize;
  int * hsize;
  int * hofs;
  int * oofs;
  void * dmap;
} ;

struct profiler
{
  double section0;
} ;

extern "C" int interpolator(struct dataobj *restrict p0re_vec, struct dataobj *restrict p0recre_vec, struct dataobj *restrict p0recre_coords_vec, const int x_M, const int x_m, const int y_M, const int y_m, const int z_M, const int z_m, const int freq_dim_M, const int freq_dim_m, const float h_x, const float h_y, const float h_z, const float o_x, const float o_y, const float o_z, const int recdim_M, const int recdim_m, const int nthreads_nonaffine, struct profiler * timers);


int interpolator(struct dataobj *restrict p0re_vec, struct dataobj *restrict p0recre_vec, struct dataobj *restrict p0recre_coords_vec, const int x_M, const int x_m, const int y_M, const int y_m, const int z_M, const int z_m, const int freq_dim_M, const int freq_dim_m, const float h_x, const float h_y, const float h_z, const float o_x, const float o_y, const float o_z, const int recdim_M, const int recdim_m, const int nthreads_nonaffine, struct profiler * timers)
{
  float (*restrict p0re)[p0re_vec->size[1]][p0re_vec->size[2]][p0re_vec->size[3]] __attribute__ ((aligned (64))) = (float (*)[p0re_vec->size[1]][p0re_vec->size[2]][p0re_vec->size[3]]) p0re_vec->data;
  float (*restrict p0recre)[p0recre_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[p0recre_vec->size[1]]) p0recre_vec->data;
  float (*restrict p0recre_coords)[p0recre_coords_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[p0recre_coords_vec->size[1]]) p0recre_coords_vec->data;

  /* Flush denormal numbers to zero in hardware */
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);

  float r0 = 1.0F/h_x;
  float r1 = 1.0F/h_y;
  float r2 = 1.0F/h_z;

  START(section0)
  #pragma omp parallel num_threads(nthreads_nonaffine)
  {
    int chunk_size = (int)(fmax(1, (1.0F/3.0F)*(recdim_M - recdim_m + 1)/nthreads_nonaffine));
    #pragma omp for schedule(dynamic,chunk_size)
    for (int recdim = recdim_m; recdim <= recdim_M; recdim += 1)
    {
      for (int freq_dim = freq_dim_m; freq_dim <= freq_dim_M; freq_dim += 1)
      {
        float r3 = floor(r0*(-o_x + p0recre_coords[recdim][0]));
        int posx = (int)r3;
        float r4 = floor(r1*(-o_y + p0recre_coords[recdim][1]));
        int posy = (int)r4;
        float r5 = floor(r2*(-o_z + p0recre_coords[recdim][2]));
        int posz = (int)r5;
        float px = r0*(-o_x + p0recre_coords[recdim][0]) - r3;
        float py = r1*(-o_y + p0recre_coords[recdim][1]) - r4;
        float pz = r2*(-o_z + p0recre_coords[recdim][2]) - r5;
        float sum = 0.0F;

        for (int rp0recrex = 0; rp0recrex <= 1; rp0recrex += 1)
        {
          for (int rp0recrey = 0; rp0recrey <= 1; rp0recrey += 1)
          {
            for (int rp0recrez = 0; rp0recrez <= 1; rp0recrez += 1)
            {
              if (rp0recrex + posx >= x_m - 1 && rp0recrey + posy >= y_m - 1 && rp0recrez + posz >= z_m - 1 && rp0recrex + posx <= x_M + 1 && rp0recrey + posy <= y_M + 1 && rp0recrez + posz <= z_M + 1)
              {
                sum += (rp0recrex*px + (1 - rp0recrex)*(1 - px))*(rp0recrey*py + (1 - rp0recrey)*(1 - py))*(rp0recrez*pz + (1 - rp0recrez)*(1 - pz))*p0re[rp0recrex + posx + 8][rp0recrey + posy + 8][rp0recrez + posz + 8][freq_dim];
              }
            }
          }
        }

        p0recre[recdim][freq_dim] = sum;
      }
    }
  }
  STOP(section0,timers)

  return 0;
}
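
The interpolator above reads p0re at each receiver coordinate with standard trilinear weights, exactly as in the loop nest over rp0recrex/rp0recrey/rp0recrez. A minimal NumPy sketch of the same computation for a single receiver at (0.4, 0.4, 0.4), using hypothetical names (p0re_ref would hold the accumulated projection on the full grid):

import numpy as np

h = 1.0 / 50                               # grid spacing: extent 1.0, 51 points per axis
coord = np.array([0.4, 0.4, 0.4])
pos = np.floor(coord / h).astype(int)      # lower corner of the enclosing cell
frac = coord / h - pos                     # fractional offsets px, py, pz

p0re_ref = np.zeros((51, 51, 51, 5))       # hypothetical accumulated projection
rec = np.zeros(5)                          # interpolated value per frequency
for ix in (0, 1):
    for iy in (0, 1):
        for iz in (0, 1):
            w = ((ix*frac[0] + (1 - ix)*(1 - frac[0])) *
                 (iy*frac[1] + (1 - iy)*(1 - frac[1])) *
                 (iz*frac[2] + (1 - iz)*(1 - frac[2])))
            rec += w * p0re_ref[pos[0] + ix, pos[1] + iy, pos[2] + iz, :]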

Now, when we run the same code with MPI, we get very different results:

cvx@cbox-lukedecker-extfwimpi:~$ DEVITO_LOGGING=DEBUG  DEVITO_MPI=1 OMP_NUM_THREADS=22 mpirun -n 2 -display-map  -mca btl vader,self -map-by numa:PE=24 python mfe.py 
libibverbs: Warning: couldn't open config directory '/usr/local/etc/libibverbs.d'.
 Data for JOB [51184,1] offset 0 Total slots allocated 48

 ========================   JOB MAP   ========================

 Data for node: cbox-lukedecker-extfwimpi       Num slots: 48   Max slots: 0    Num procs: 2
        Process OMPI jobid: [51184,1] App: 0 Process rank: 0 Bound: socket 0[core 0[hwt 0]], socket 0[core 1[hwt 0]], socket 0[core 2[hwt 0]], socket 0[core 3[hwt 0]], socket 0[core 4[hwt 0]], socket 0[core 5[hwt 0]], socket 0[core 6[hwt 0]], socket 0[core 7[hwt 0]], socket 0[core 8[hwt 0]], socket 0[core 9[hwt 0]], socket 0[core 10[hwt 0]], socket 0[core 11[hwt 0]], socket 0[core 12[hwt 0]], socket 0[core 13[hwt 0]], socket 0[core 14[hwt 0]], socket 0[core 15[hwt 0]], socket 0[core 16[hwt 0]], socket 0[core 17[hwt 0]], socket 0[core 18[hwt 0]], socket 0[core 19[hwt 0]], socket 0[core 20[hwt 0]], socket 0[core 21[hwt 0]], socket 0[core 22[hwt 0]], socket 0[core 23[hwt 0]]:[B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B][./././././././././././././././././././././././.]
        Process OMPI jobid: [51184,1] App: 0 Process rank: 1 Bound: socket 1[core 24[hwt 0]], socket 1[core 25[hwt 0]], socket 1[core 26[hwt 0]], socket 1[core 27[hwt 0]], socket 1[core 28[hwt 0]], socket 1[core 29[hwt 0]], socket 1[core 30[hwt 0]], socket 1[core 31[hwt 0]], socket 1[core 32[hwt 0]], socket 1[core 33[hwt 0]], socket 1[core 34[hwt 0]], socket 1[core 35[hwt 0]], socket 1[core 36[hwt 0]], socket 1[core 37[hwt 0]], socket 1[core 38[hwt 0]], socket 1[core 39[hwt 0]], socket 1[core 40[hwt 0]], socket 1[core 41[hwt 0]], socket 1[core 42[hwt 0]], socket 1[core 43[hwt 0]], socket 1[core 44[hwt 0]], socket 1[core 45[hwt 0]], socket 1[core 46[hwt 0]], socket 1[core 47[hwt 0]]:[./././././././././././././././././././././././.][B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B]

 =============================================================
libibverbs: Warning: couldn't open config directory '/usr/local/etc/libibverbs.d'.
libibverbs: Warning: couldn't open config directory '/usr/local/etc/libibverbs.d'.
Allocating host memory for frequencies(5,) [20 B]
Allocating host memory for frequencies(5,) [20 B]
Allocating host memory for src_coords(0, 3) [0 B]
Allocating host memory for src_coords(1, 3) [12 B]
Allocating host memory for p0recre_coords(6, 3) [72 B]
Allocating host memory for p0recre_coords(6, 3) [72 B]
Operator `propagator` generated in 1.26 s
  * lowering.IET: 0.61 s (48.7 %)
     * specializing.IET: 0.51 s (40.8 %)
        * make_mpi: 0.28 s (22.4 %)
  * lowering.Clusters: 0.39 s (31.2 %)
Flops reduction after symbolic optimization: [152 --> 54]
Allocating host memory for p0(3, 41, 67, 67) [2 MB]
Operator `propagator` generated in 1.26 s
  * lowering.IET: 0.61 s (48.5 %)
     * specializing.IET: 0.51 s (40.6 %)
        * make_mpi: 0.28 s (22.3 %)
  * lowering.Clusters: 0.39 s (31.0 %)
Flops reduction after symbolic optimization: [152 --> 54]
Allocating host memory for p0(3, 42, 67, 67) [2 MB]
Allocating host memory for p0re(42, 67, 67, 5) [4 MB]
Allocating host memory for p0re(41, 67, 67, 5) [4 MB]
p_src_m=0 and p_src_M=-1 might cause no iterations along Dimension p_src
Operator `propagator` fetched `/tmp/devito-jitcache-uid1000/a653349b99b6968e90b8c82ca727c2089fdab675.cpp` in 0.13 s from jit-cache
Operator `propagator` fetched `/tmp/devito-jitcache-uid1000/a653349b99b6968e90b8c82ca727c2089fdab675.cpp` in 0.21 s from jit-cache
Operator `propagator` ran in 0.05 s
Operator `propagator` ran in 0.05 s
Global performance: [OI=0.52, 28.14 GFlops/s, 0.54 GPts/s]
Global performance: [OI=0.52, 28.14 GFlops/s, 0.54 GPts/s]
Local performance:
Local performance:
  * multipass0[rank0]<> ran in 0.04 s [OI=0.52, 18.29 GFlops/s, 0.35 GPts/s]
  * multipass0[rank0]<> ran in 0.04 s [OI=0.52, 18.29 GFlops/s, 0.35 GPts/s]
    + section0 ran in 0.01 s [9.28%] 
    + section0 ran in 0.01 s [9.28%] 
    + section1 ran in 0.04 s [90.55%] 
    + section1 ran in 0.04 s [90.55%] 
  * multipass0[rank1]<> ran in 0.04 s [OI=0.52, 17.60 GFlops/s, 0.34 GPts/s]
  * multipass0[rank1]<> ran in 0.04 s [OI=0.52, 17.60 GFlops/s, 0.34 GPts/s]
    + section0 ran in 0.01 s [9.29%] 
    + section0 ran in 0.01 s [9.29%] 
    + section1 ran in 0.04 s [90.65%] 
    + section1 ran in 0.04 s [90.65%] 
  * section2[rank1]<<200,1,5,2,2,2>,<200,1,5,2,2,2>> ran in 0.01 s [OI=12.48, 0.69 GFlops/s, 0.01 GPts/s]
  * section2[rank1]<<200,1,5,2,2,2>,<200,1,5,2,2,2>> ran in 0.01 s [OI=12.48, 0.69 GFlops/s, 0.01 GPts/s]
Performance[mode=advanced] arguments: {'nthreads': 22, 'nthreads_nonaffine': 22, 'x0_blk0_size': 8, 'y0_blk0_size': 8}
Performance[mode=advanced] arguments: {'nthreads': 22, 'nthreads_nonaffine': 22, 'x0_blk0_size': 8, 'y0_blk0_size': 8}
Operator `interpolator` generated in 0.27 s
  * lowering.Clusters: 0.11 s (40.8 %)
     * specializing.Clusters: 0.08 s (29.7 %)
  * lowering.IET: 0.11 s (40.8 %)
     * specializing.IET: 0.08 s (29.7 %)
Flops reduction after symbolic optimization: [36 --> 21]
Operator `interpolator` generated in 0.27 s
  * lowering.Clusters: 0.11 s (40.8 %)
     * specializing.Clusters: 0.08 s (29.7 %)
  * lowering.IET: 0.11 s (40.8 %)
     * specializing.IET: 0.08 s (29.7 %)
Flops reduction after symbolic optimization: [36 --> 21]
Allocating host memory for p0recre(6, 5) [120 B]
Allocating host memory for p0recre(6, 5) [120 B]
recdim_m=0 and recdim_M=-1 might cause no iterations along Dimension recdim
recdim_m=0 and recdim_M=-1 might cause no iterations along Dimension recdim
Operator `interpolator` fetched `/tmp/devito-jitcache-uid1000/c3edbe514ed93e1118c8a9b2ee38bcf004de8df5.cpp` in 0.02 s from jit-cache
Operator `interpolator` fetched `/tmp/devito-jitcache-uid1000/c3edbe514ed93e1118c8a9b2ee38bcf004de8df5.cpp` in 0.02 s from jit-cache
Operator `interpolator` ran in 0.01 s
Operator `interpolator` ran in 0.01 s
Global performance: [OI=3.05, 0.01 GFlops/s]
Global performance: [OI=3.05, 0.01 GFlops/s]
Local performance:
Local performance:
  * section1[rank0]<<12,5>,<12,5,2,2,2>> ran in 0.01 s [OI=3.05, 0.04 GFlops/s, 0.00 GPts/s]
  * section1[rank0]<<12,5>,<12,5,2,2,2>> ran in 0.01 s [OI=3.05, 0.04 GFlops/s, 0.00 GPts/s]
Performance[mode=advanced] arguments: {'nthreads_nonaffine': 22}
Performance[mode=advanced] arguments: {'nthreads_nonaffine': 22}
[[7.8738990e-14 1.2676886e-13 1.4294368e-13 1.4095608e-13 1.3165730e-13]
 [7.8738990e-14 1.2676886e-13 1.4294368e-13 1.4095608e-13 1.3165730e-13]
 [7.8738990e-14 1.2676886e-13 1.4294368e-13 1.4095608e-13 1.3165730e-13]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00]]
[[7.8738990e-14 1.2676886e-13 1.4294368e-13 1.4095608e-13 1.3165730e-13]
 [7.8738990e-14 1.2676886e-13 1.4294368e-13 1.4095608e-13 1.3165730e-13]
 [7.8738990e-14 1.2676886e-13 1.4294368e-13 1.4095608e-13 1.3165730e-13]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00]]

Here is the generated MPI propagator code:

#define _POSIX_C_SOURCE 200809L
#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);
#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;
#define MIN(a,b) (((a) < (b)) ? (a) : (b))

#include "stdlib.h"
#include "math.h"
#include "sys/time.h"
#include "xmmintrin.h"
#include "pmmintrin.h"
#include "mpi.h"
#include "omp.h"

struct dataobj
{
  void *restrict data;
  unsigned long * size;
  unsigned long * npsize;
  unsigned long * dsize;
  int * hsize;
  int * hofs;
  int * oofs;
  void * dmap;
} ;

struct neighborhood
{
  int lll, llc, llr, lcl, lcc, lcr, lrl, lrc, lrr;
  int cll, clc, clr, ccl, ccc, ccr, crl, crc, crr;
  int rll, rlc, rlr, rcl, rcc, rcr, rrl, rrc, rrr;
} ;

struct profiler
{
  double multipass0;
  double section2;
  double section0;
  double section1;
} ;

extern "C" int propagator(struct dataobj *restrict frequencies_vec, struct dataobj *restrict p0_vec, struct dataobj *restrict p0re_vec, struct dataobj *restrict src_coords_vec, const int x_M, const int x_m, const int y_M, const int y_m, const int z_M, const int z_m, const int freq_dim_M, const int freq_dim_m, const float h_x, const float h_y, const float h_z, const float o_x, const float o_y, const float o_z, const int p_src_M, const int p_src_m, const int time_M, const int time_m, const int x0_blk0_size, const int y0_blk0_size, MPI_Comm comm, struct neighborhood * nb, const int nthreads, const int nthreads_nonaffine, struct profiler * timers);

static void sendrecv0(struct dataobj *restrict p0_vec, const int x_size, const int y_size, const int z_size, int ogtime, int ogx, int ogy, int ogz, int ostime, int osx, int osy, int osz, int fromrank, int torank, MPI_Comm comm, const int nthreads);
static void haloupdate0(struct dataobj *restrict p0_vec, MPI_Comm comm, struct neighborhood * nb, int otime, const int nthreads);
static void gather0(float *restrict buf_vec, int bx_size, int by_size, int bz_size, struct dataobj *restrict p0_vec, const int otime, const int ox, const int oy, const int oz, const int nthreads);
static void scatter0(float *restrict buf_vec, int bx_size, int by_size, int bz_size, struct dataobj *restrict p0_vec, const int otime, const int ox, const int oy, const int oz, const int nthreads);

int propagator(struct dataobj *restrict frequencies_vec, struct dataobj *restrict p0_vec, struct dataobj *restrict p0re_vec, struct dataobj *restrict src_coords_vec, const int x_M, const int x_m, const int y_M, const int y_m, const int z_M, const int z_m, const int freq_dim_M, const int freq_dim_m, const float h_x, const float h_y, const float h_z, const float o_x, const float o_y, const float o_z, const int p_src_M, const int p_src_m, const int time_M, const int time_m, const int x0_blk0_size, const int y0_blk0_size, MPI_Comm comm, struct neighborhood * nb, const int nthreads, const int nthreads_nonaffine, struct profiler * timers)
{
  float *restrict r1_vec __attribute__ ((aligned (64)));
  posix_memalign((void**)(&r1_vec),64,5*sizeof(float));

  float (*restrict frequencies) __attribute__ ((aligned (64))) = (float (*)) frequencies_vec->data;
  float (*restrict p0)[p0_vec->size[1]][p0_vec->size[2]][p0_vec->size[3]] __attribute__ ((aligned (64))) = (float (*)[p0_vec->size[1]][p0_vec->size[2]][p0_vec->size[3]]) p0_vec->data;
  float (*restrict p0re)[p0re_vec->size[1]][p0re_vec->size[2]][p0re_vec->size[3]] __attribute__ ((aligned (64))) = (float (*)[p0re_vec->size[1]][p0re_vec->size[2]][p0re_vec->size[3]]) p0re_vec->data;
  float (*restrict r1) __attribute__ ((aligned (64))) = (float (*)) r1_vec;
  float (*restrict src_coords)[src_coords_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[src_coords_vec->size[1]]) src_coords_vec->data;

  /* Flush denormal numbers to zero in hardware */
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);

  float r2 = 1.0F/(h_x*h_x);
  float r3 = 1.0F/(h_y*h_y);
  float r4 = 1.0F/(h_z*h_z);

  for (int time = time_m, t0 = (time)%(3), t1 = (time + 2)%(3), t2 = (time + 1)%(3); time <= time_M; time += 1, t0 = (time)%(3), t1 = (time + 2)%(3), t2 = (time + 1)%(3))
  {
    START(multipass0)
    START(section0)
    #pragma omp parallel num_threads(nthreads)
    {
      #pragma omp for schedule(static,1)
      for (int freq_dim = freq_dim_m; freq_dim <= freq_dim_M; freq_dim += 1)
      {
        r1[freq_dim] = sin(6.28318e-2F*time*frequencies[freq_dim]);
      }
    }
    STOP(section0,timers)

    START(section1)
    haloupdate0(p0_vec,comm,nb,t0,nthreads);
    #pragma omp parallel num_threads(nthreads)
    {
      #pragma omp for collapse(2) schedule(dynamic,1)
      for (int x0_blk0 = x_m; x0_blk0 <= x_M; x0_blk0 += x0_blk0_size)
      {
        for (int y0_blk0 = y_m; y0_blk0 <= y_M; y0_blk0 += y0_blk0_size)
        {
          for (int x = x0_blk0; x <= MIN(x_M, x0_blk0 + x0_blk0_size - 1); x += 1)
          {
            for (int y = y0_blk0; y <= MIN(y_M, y0_blk0 + y0_blk0_size - 1); y += 1)
            {
              for (int z = z_m; z <= z_M; z += 1)
              {
                float r5 = 1.42361111100763e-5F*p0[t0][x + 8][y + 8][z + 8];
                p0[t2][x + 8][y + 8][z + 8] = r2*(r5 + 8.9285714284415e-9F*(p0[t0][x + 4][y + 8][z + 8] + p0[t0][x + 12][y + 8][z + 8]) + (-1.26984126982279e-7F)*(p0[t0][x + 5][y + 8][z + 8] + p0[t0][x + 11][y + 8][z + 8]) + 1.00000000005821e-6F*(p0[t0][x + 6][y + 8][z + 8] + p0[t0][x + 10][y + 8][z + 8]) + (-8.00000000046566e-6F)*(p0[t0][x + 7][y + 8][z + 8] + p0[t0][x + 9][y + 8][z + 8])) + r3*(r5 + 8.9285714284415e-9F*(p0[t0][x + 8][y + 4][z + 8] + p0[t0][x + 8][y + 12][z + 8]) + (-1.26984126982279e-7F)*(p0[t0][x + 8][y + 5][z + 8] + p0[t0][x + 8][y + 11][z + 8]) + 1.00000000005821e-6F*(p0[t0][x + 8][y + 6][z + 8] + p0[t0][x + 8][y + 10][z + 8]) + (-8.00000000046566e-6F)*(p0[t0][x + 8][y + 7][z + 8] + p0[t0][x + 8][y + 9][z + 8])) + r4*(r5 + 8.9285714284415e-9F*(p0[t0][x + 8][y + 8][z + 4] + p0[t0][x + 8][y + 8][z + 12]) + (-1.26984126982279e-7F)*(p0[t0][x + 8][y + 8][z + 5] + p0[t0][x + 8][y + 8][z + 11]) + 1.00000000005821e-6F*(p0[t0][x + 8][y + 8][z + 6] + p0[t0][x + 8][y + 8][z + 10]) + (-8.00000000046566e-6F)*(p0[t0][x + 8][y + 8][z + 7] + p0[t0][x + 8][y + 8][z + 9])) + p0[t1][x + 8][y + 8][z + 8];

                #pragma omp simd aligned(p0,p0re:64)
                for (int freq_dim = freq_dim_m; freq_dim <= freq_dim_M; freq_dim += 1)
                {
                  p0re[x + 8][y + 8][z + 8][freq_dim] = r1[freq_dim]*p0[t0][x + 8][y + 8][z + 8] + p0re[x + 8][y + 8][z + 8][freq_dim];
                }
              }
            }
          }
        }
      }
    }
    STOP(section1,timers)
    STOP(multipass0,timers)

    START(section2)
    #pragma omp parallel num_threads(nthreads_nonaffine)
    {
      int chunk_size = (int)(fmax(1, (1.0F/3.0F)*(p_src_M - p_src_m + 1)/nthreads_nonaffine));
      #pragma omp for schedule(dynamic,chunk_size)
      for (int p_src = p_src_m; p_src <= p_src_M; p_src += 1)
      {
        for (int freq_dim = freq_dim_m; freq_dim <= freq_dim_M; freq_dim += 1)
        {
          for (int rsrcx = 0; rsrcx <= 1; rsrcx += 1)
          {
            for (int rsrcy = 0; rsrcy <= 1; rsrcy += 1)
            {
              for (int rsrcz = 0; rsrcz <= 1; rsrcz += 1)
              {
                int posx = (int)(floor((-o_x + src_coords[p_src][0])/h_x));
                int posy = (int)(floor((-o_y + src_coords[p_src][1])/h_y));
                int posz = (int)(floor((-o_z + src_coords[p_src][2])/h_z));
                float px = -floor((-o_x + src_coords[p_src][0])/h_x) + (-o_x + src_coords[p_src][0])/h_x;
                float py = -floor((-o_y + src_coords[p_src][1])/h_y) + (-o_y + src_coords[p_src][1])/h_y;
                float pz = -floor((-o_z + src_coords[p_src][2])/h_z) + (-o_z + src_coords[p_src][2])/h_z;
                if (rsrcx + posx >= x_m - 1 && rsrcy + posy >= y_m - 1 && rsrcz + posz >= z_m - 1 && rsrcx + posx <= x_M + 1 && rsrcy + posy <= y_M + 1 && rsrcz + posz <= z_M + 1)
                {
                  float r0 = (1.0F/100000.0F)*(rsrcx*px + (1 - rsrcx)*(1 - px))*(rsrcy*py + (1 - rsrcy)*(1 - py))*(rsrcz*pz + (1 - rsrcz)*(1 - pz))*sin(6.28318e-2F*time*frequencies[freq_dim]);
                  #pragma omp atomic update
                  p0[t2][rsrcx + posx + 8][rsrcy + posy + 8][rsrcz + posz + 8] += r0;
                }
              }
            }
          }
        }
      }
    }
    STOP(section2,timers)
  }

  free(r1_vec);

  return 0;
}

static void sendrecv0(struct dataobj *restrict p0_vec, const int x_size, const int y_size, const int z_size, int ogtime, int ogx, int ogy, int ogz, int ostime, int osx, int osy, int osz, int fromrank, int torank, MPI_Comm comm, const int nthreads)
{
  float *restrict bufg_vec __attribute__ ((aligned (64)));
  posix_memalign((void**)(&bufg_vec),64,x_size*y_size*z_size*sizeof(float));
  float *restrict bufs_vec __attribute__ ((aligned (64)));
  posix_memalign((void**)(&bufs_vec),64,x_size*y_size*z_size*sizeof(float));

  MPI_Request rrecv;
  MPI_Request rsend;

  MPI_Irecv(bufs_vec,x_size*y_size*z_size,MPI_FLOAT,fromrank,13,comm,&(rrecv));
  if (torank != MPI_PROC_NULL)
  {
    gather0(bufg_vec,x_size,y_size,z_size,p0_vec,ogtime,ogx,ogy,ogz,nthreads);
  }
  MPI_Isend(bufg_vec,x_size*y_size*z_size,MPI_FLOAT,torank,13,comm,&(rsend));
  MPI_Wait(&(rsend),MPI_STATUS_IGNORE);
  MPI_Wait(&(rrecv),MPI_STATUS_IGNORE);
  if (fromrank != MPI_PROC_NULL)
  {
    scatter0(bufs_vec,x_size,y_size,z_size,p0_vec,ostime,osx,osy,osz,nthreads);
  }

  free(bufg_vec);
  free(bufs_vec);
}

static void haloupdate0(struct dataobj *restrict p0_vec, MPI_Comm comm, struct neighborhood * nb, int otime, const int nthreads)
{
  sendrecv0(p0_vec,p0_vec->hsize[3],p0_vec->npsize[2],p0_vec->npsize[3],otime,p0_vec->oofs[2],p0_vec->hofs[4],p0_vec->hofs[6],otime,p0_vec->hofs[3],p0_vec->hofs[4],p0_vec->hofs[6],nb->rcc,nb->lcc,comm,nthreads);
  sendrecv0(p0_vec,p0_vec->hsize[2],p0_vec->npsize[2],p0_vec->npsize[3],otime,p0_vec->oofs[3],p0_vec->hofs[4],p0_vec->hofs[6],otime,p0_vec->hofs[2],p0_vec->hofs[4],p0_vec->hofs[6],nb->lcc,nb->rcc,comm,nthreads);
  sendrecv0(p0_vec,p0_vec->npsize[1],p0_vec->hsize[5],p0_vec->npsize[3],otime,p0_vec->hofs[2],p0_vec->oofs[4],p0_vec->hofs[6],otime,p0_vec->hofs[2],p0_vec->hofs[5],p0_vec->hofs[6],nb->crc,nb->clc,comm,nthreads);
  sendrecv0(p0_vec,p0_vec->npsize[1],p0_vec->hsize[4],p0_vec->npsize[3],otime,p0_vec->hofs[2],p0_vec->oofs[5],p0_vec->hofs[6],otime,p0_vec->hofs[2],p0_vec->hofs[4],p0_vec->hofs[6],nb->clc,nb->crc,comm,nthreads);
  sendrecv0(p0_vec,p0_vec->npsize[1],p0_vec->npsize[2],p0_vec->hsize[7],otime,p0_vec->hofs[2],p0_vec->hofs[4],p0_vec->oofs[6],otime,p0_vec->hofs[2],p0_vec->hofs[4],p0_vec->hofs[7],nb->ccr,nb->ccl,comm,nthreads);
  sendrecv0(p0_vec,p0_vec->npsize[1],p0_vec->npsize[2],p0_vec->hsize[6],otime,p0_vec->hofs[2],p0_vec->hofs[4],p0_vec->oofs[7],otime,p0_vec->hofs[2],p0_vec->hofs[4],p0_vec->hofs[6],nb->ccl,nb->ccr,comm,nthreads);
}

static void gather0(float *restrict buf_vec, int bx_size, int by_size, int bz_size, struct dataobj *restrict p0_vec, const int otime, const int ox, const int oy, const int oz, const int nthreads)
{
  float (*restrict buf)[bx_size][by_size][bz_size] __attribute__ ((aligned (64))) = (float (*)[bx_size][by_size][bz_size]) buf_vec;
  float (*restrict p0)[p0_vec->size[1]][p0_vec->size[2]][p0_vec->size[3]] __attribute__ ((aligned (64))) = (float (*)[p0_vec->size[1]][p0_vec->size[2]][p0_vec->size[3]]) p0_vec->data;

  const int x_m = 0;
  const int y_m = 0;
  const int z_m = 0;
  const int x_M = bx_size - 1;
  const int y_M = by_size - 1;
  const int z_M = bz_size - 1;
  #pragma omp parallel num_threads(nthreads)
  {
    #pragma omp for collapse(2) schedule(static,1)
    for (int x = x_m; x <= x_M; x += 1)
    {
      for (int y = y_m; y <= y_M; y += 1)
      {
        #pragma omp simd aligned(p0:64)
        for (int z = z_m; z <= z_M; z += 1)
        {
          buf[0][x][y][z] = p0[otime][x + ox][y + oy][z + oz];
        }
      }
    }
  }
}

static void scatter0(float *restrict buf_vec, int bx_size, int by_size, int bz_size, struct dataobj *restrict p0_vec, const int otime, const int ox, const int oy, const int oz, const int nthreads)
{
  float (*restrict buf)[bx_size][by_size][bz_size] __attribute__ ((aligned (64))) = (float (*)[bx_size][by_size][bz_size]) buf_vec;
  float (*restrict p0)[p0_vec->size[1]][p0_vec->size[2]][p0_vec->size[3]] __attribute__ ((aligned (64))) = (float (*)[p0_vec->size[1]][p0_vec->size[2]][p0_vec->size[3]]) p0_vec->data;

  const int x_m = 0;
  const int y_m = 0;
  const int z_m = 0;
  const int x_M = bx_size - 1;
  const int y_M = by_size - 1;
  const int z_M = bz_size - 1;
  #pragma omp parallel num_threads(nthreads)
  {
    #pragma omp for collapse(2) schedule(static,1)
    for (int x = x_m; x <= x_M; x += 1)
    {
      for (int y = y_m; y <= y_M; y += 1)
      {
        #pragma omp simd aligned(p0:64)
        for (int z = z_m; z <= z_M; z += 1)
        {
          p0[otime][x + ox][y + oy][z + oz] = buf[0][x][y][z];
        }
      }
    }
  }
}

And the generated MPI interpolator code:

#define _POSIX_C_SOURCE 200809L
#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);
#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;

#include "stdlib.h"
#include "math.h"
#include "sys/time.h"
#include "xmmintrin.h"
#include "pmmintrin.h"
#include "mpi.h"
#include "omp.h"

struct dataobj
{
  void *restrict data;
  unsigned long * size;
  unsigned long * npsize;
  unsigned long * dsize;
  int * hsize;
  int * hofs;
  int * oofs;
  void * dmap;
} ;

struct profiler
{
  double section0;
  double section1;
} ;

extern "C" int interpolator(struct dataobj *restrict p0re_vec, struct dataobj *restrict p0recre_vec, struct dataobj *restrict p0recre_coords_vec, const int x_M, const int x_m, const int y_M, const int y_m, const int z_M, const int z_m, const int freq_dim_M, const int freq_dim_m, const float h_x, const float h_y, const float h_z, const float o_x, const float o_y, const float o_z, const int recdim_M, const int recdim_m, const int nthreads_nonaffine, struct profiler * timers);


int interpolator(struct dataobj *restrict p0re_vec, struct dataobj *restrict p0recre_vec, struct dataobj *restrict p0recre_coords_vec, const int x_M, const int x_m, const int y_M, const int y_m, const int z_M, const int z_m, const int freq_dim_M, const int freq_dim_m, const float h_x, const float h_y, const float h_z, const float o_x, const float o_y, const float o_z, const int recdim_M, const int recdim_m, const int nthreads_nonaffine, struct profiler * timers)
{
  float (*restrict p0re)[p0re_vec->size[1]][p0re_vec->size[2]][p0re_vec->size[3]] __attribute__ ((aligned (64))) = (float (*)[p0re_vec->size[1]][p0re_vec->size[2]][p0re_vec->size[3]]) p0re_vec->data;
  float (*restrict p0recre)[p0recre_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[p0recre_vec->size[1]]) p0recre_vec->data;
  float (*restrict p0recre_coords)[p0recre_coords_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[p0recre_coords_vec->size[1]]) p0recre_coords_vec->data;

  /* Flush denormal numbers to zero in hardware */
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);

  float r0 = 1.0F/h_x;
  float r1 = 1.0F/h_y;
  float r2 = 1.0F/h_z;
  START(section0)
  for (int freq_dim = freq_dim_m; freq_dim <= freq_dim_M; freq_dim += 1)
  {
  }
  STOP(section0,timers)

  START(section1)
  #pragma omp parallel num_threads(nthreads_nonaffine)
  {
    int chunk_size = (int)(fmax(1, (1.0F/3.0F)*(recdim_M - recdim_m + 1)/nthreads_nonaffine));
    #pragma omp for schedule(dynamic,chunk_size)
    for (int recdim = recdim_m; recdim <= recdim_M; recdim += 1)
    {
      for (int freq_dim = freq_dim_m; freq_dim <= freq_dim_M; freq_dim += 1)
      {
        float r3 = floor(r0*(-o_x + p0recre_coords[recdim][0]));
        int posx = (int)r3;
        float r4 = floor(r1*(-o_y + p0recre_coords[recdim][1]));
        int posy = (int)r4;
        float r5 = floor(r2*(-o_z + p0recre_coords[recdim][2]));
        int posz = (int)r5;
        float px = r0*(-o_x + p0recre_coords[recdim][0]) - r3;
        float py = r1*(-o_y + p0recre_coords[recdim][1]) - r4;
        float pz = r2*(-o_z + p0recre_coords[recdim][2]) - r5;
        float sum = 0.0F;

        for (int rp0recrex = 0; rp0recrex <= 1; rp0recrex += 1)
        {
          for (int rp0recrey = 0; rp0recrey <= 1; rp0recrey += 1)
          {
            for (int rp0recrez = 0; rp0recrez <= 1; rp0recrez += 1)
            {
              if (rp0recrex + posx >= x_m - 1 && rp0recrey + posy >= y_m - 1 && rp0recrez + posz >= z_m - 1 && rp0recrex + posx <= x_M + 1 && rp0recrey + posy <= y_M + 1 && rp0recrez + posz <= z_M + 1)
              {
                sum += (rp0recrex*px + (1 - rp0recrex)*(1 - px))*(rp0recrey*py + (1 - rp0recrey)*(1 - py))*(rp0recrez*pz + (1 - rp0recrez)*(1 - pz))*p0re[rp0recrex + posx + 8][rp0recrey + posy + 8][rp0recrez + posz + 8][freq_dim];
              }
            }
          }
        }

        p0recre[recdim][freq_dim] = sum;
      }
    }
  }
  STOP(section1,timers)

  return 0;
}
