Closed
Description
Consider the following MFE which simulates wave propagation and projection into a frequency basis followed by frequency extraction:
import devito
import numpy as np
shape = (51,51,51)
nfreq = 5
nrec = 6
space_order = 8
dt = 0.01
time_M = 200
grid = devito.Grid(shape=shape, extent=(1,1,1))
freq_dim = devito.DefaultDimension(name="freq_dim", default_value=nfreq)
f = devito.Function(name="frequencies", dimensions=(freq_dim,), shape=(nfreq,), grid=grid)
f.data[:] = np.arange(1,nfreq+1)
p0 = devito.TimeFunction(name="p0", grid=grid, time_order=2, space_order=space_order)
p0re = devito.Function(name="p0re", grid=grid, dimensions=(*grid.dimensions, freq_dim), shape=(*shape, nfreq), space_order=space_order)
src = devito.SparseTimeFunction(name="src", grid=grid, nt=time_M, npoint=1, coordinates=np.array([0.5,0.5,0.5]))
class CoordSlowSparseFunction(devito.SparseFunction):
_sparse_position = 0
recdim = devito.Dimension(name="recdim")
p0recre = CoordSlowSparseFunction(name="p0recre", grid=grid, dimensions=(recdim, freq_dim), shape=(nrec, nfreq), npoint=nrec, coordinates=np.array([0.4,0.4,0.4]))
wp_eq = devito.Eq(p0.forward, -5/1000000*(p0.dx2 + p0.dy2 + p0.dz2) + p0.backward)
inj_eq = src.inject(field=p0.forward, expr=devito.sin(2*3.14159*f*grid.time_dim*dt)/100000)
proj_eq_re = devito.Eq(p0re, p0re + devito.sin(2*3.14159*f*grid.time_dim*dt)*p0)
wp_proj_op = devito.Operator([wp_eq, inj_eq, proj_eq_re], name="propagator")
wp_proj_op.apply(time_M=time_M)
interp_eq_re = p0recre.interpolate(expr=p0re)
interp_op = devito.Operator([interp_eq_re], name="interpolator")
interp_op.apply()
print(p0recre.data)
Running this operator without MPI we have the following behavior:
cvx@cbox-lukedecker-extfwimpi:~$ DEVITO_LOGGING=DEBUG python mfe.py
Allocating host memory for frequencies(5,) [20 B]
Allocating host memory for src_coords(1, 3) [12 B]
Allocating host memory for p0recre_coords(6, 3) [72 B]
Operator `propagator` generated in 0.88 s
* lowering.Clusters: 0.39 s (44.5 %)
* specializing.Clusters: 0.24 s (27.4 %)
* lowering.IET: 0.23 s (26.3 %)
* lowering.Expressions: 0.22 s (25.2 %)
Flops reduction after symbolic optimization: [152 --> 54]
Allocating host memory for p0(3, 67, 67, 67) [3 MB]
Allocating host memory for p0re(67, 67, 67, 5) [6 MB]
Operator `propagator` fetched `/tmp/devito-jitcache-uid1000/94c579b990e202a56a5c59b81fe83ce007d2aff6.cpp` in 0.11 s from jit-cache
Operator `propagator` ran in 0.10 s
Global performance: [OI=0.55, 14.07 GFlops/s, 0.27 GPts/s]
Local performance:
* multipass0<> ran in 0.09 s [OI=0.55, 16.70 GFlops/s, 0.32 GPts/s]
+ section0 ran in 0.03 s [24.62%]
+ section1 ran in 0.07 s [75.36%]
* section2<<200,1,5,2,2,2>,<200,1,5,2,2,2>> ran in 0.01 s [OI=12.48, 0.13 GFlops/s, 0.01 GPts/s]
Performance[mode=advanced] arguments: {'nthreads': 48, 'nthreads_nonaffine': 48, 'x0_blk0_size': 8, 'y0_blk0_size': 8}
Operator `interpolator` generated in 0.30 s
* lowering.Clusters: 0.16 s (53.4 %)
* specializing.Clusters: 0.12 s (40.1 %)
* cire: 0.08 s (26.7 %)
* lowering.IET: 0.09 s (30.1 %)
* specializing.IET: 0.07 s (23.4 %)
Flops reduction after symbolic optimization: [36 --> 21]
Allocating host memory for p0recre(6, 5) [120 B]
Operator `interpolator` fetched `/tmp/devito-jitcache-uid1000/3129fe9dfcd34d4a03a2a7eaab3ade05ff9aa4be.cpp` in 0.02 s from jit-cache
Operator `interpolator` ran in 0.01 s
Global performance: [OI=3.05, 0.01 GFlops/s]
Local performance:
* section0<<6,5>,<6,5,2,2,2>> ran in 0.01 s [OI=3.05, 0.01 GFlops/s, 0.00 GPts/s]
Performance[mode=advanced] arguments: {'nthreads_nonaffine': 48}
[[70.95639 96.57933 94.03625 83.82484 73.3117 ]
[70.95639 96.57933 94.03625 83.82484 73.3117 ]
[70.95639 96.57933 94.03625 83.82484 73.3117 ]
[70.95639 96.57933 94.03625 83.82484 73.3117 ]
[70.95639 96.57933 94.03625 83.82484 73.3117 ]
[70.95639 96.57933 94.03625 83.82484 73.3117 ]]
With the following generated c code for the propagator:
#define _POSIX_C_SOURCE 200809L
#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);
#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;
#define MIN(a,b) (((a) < (b)) ? (a) : (b))
#include "stdlib.h"
#include "math.h"
#include "sys/time.h"
#include "xmmintrin.h"
#include "pmmintrin.h"
#include "omp.h"
struct dataobj
{
void *restrict data;
unsigned long * size;
unsigned long * npsize;
unsigned long * dsize;
int * hsize;
int * hofs;
int * oofs;
void * dmap;
} ;
struct profiler
{
double multipass0;
double section2;
double section0;
double section1;
} ;
extern "C" int propagator(struct dataobj *restrict frequencies_vec, struct dataobj *restrict p0_vec, struct dataobj *restrict p0re_vec, struct dataobj *restrict src_coords_vec, const int x_M, const int x_m, const int y_M, const int y_m, const int z_M, const int z_m, const int freq_dim_M, const int freq_dim_m, const float h_x, const float h_y, const float h_z, const float o_x, const float o_y, const float o_z, const int p_src_M, const int p_src_m, const int time_M, const int time_m, const int x0_blk0_size, const int y0_blk0_size, const int nthreads, const int nthreads_nonaffine, struct profiler * timers);
int propagator(struct dataobj *restrict frequencies_vec, struct dataobj *restrict p0_vec, struct dataobj *restrict p0re_vec, struct dataobj *restrict src_coords_vec, const int x_M, const int x_m, const int y_M, const int y_m, const int z_M, const int z_m, const int freq_dim_M, const int freq_dim_m, const float h_x, const float h_y, const float h_z, const float o_x, const float o_y, const float o_z, const int p_src_M, const int p_src_m, const int time_M, const int time_m, const int x0_blk0_size, const int y0_blk0_size, const int nthreads, const int nthreads_nonaffine, struct profiler * timers)
{
float *restrict r1_vec __attribute__ ((aligned (64)));
posix_memalign((void**)(&r1_vec),64,5*sizeof(float));
float (*restrict frequencies) __attribute__ ((aligned (64))) = (float (*)) frequencies_vec->data;
float (*restrict p0)[p0_vec->size[1]][p0_vec->size[2]][p0_vec->size[3]] __attribute__ ((aligned (64))) = (float (*)[p0_vec->size[1]][p0_vec->size[2]][p0_vec->size[3]]) p0_vec->data;
float (*restrict p0re)[p0re_vec->size[1]][p0re_vec->size[2]][p0re_vec->size[3]] __attribute__ ((aligned (64))) = (float (*)[p0re_vec->size[1]][p0re_vec->size[2]][p0re_vec->size[3]]) p0re_vec->data;
float (*restrict r1) __attribute__ ((aligned (64))) = (float (*)) r1_vec;
float (*restrict src_coords)[src_coords_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[src_coords_vec->size[1]]) src_coords_vec->data;
/* Flush denormal numbers to zero in hardware */
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
float r2 = 1.0F/(h_x*h_x);
float r3 = 1.0F/(h_y*h_y);
float r4 = 1.0F/(h_z*h_z);
for (int time = time_m, t0 = (time)%(3), t1 = (time + 2)%(3), t2 = (time + 1)%(3); time <= time_M; time += 1, t0 = (time)%(3), t1 = (time + 2)%(3), t2 = (time + 1)%(3))
{
START(multipass0)
START(section0)
#pragma omp parallel num_threads(nthreads)
{
#pragma omp for schedule(static,1)
for (int freq_dim = freq_dim_m; freq_dim <= freq_dim_M; freq_dim += 1)
{
r1[freq_dim] = sin(6.28318e-2F*time*frequencies[freq_dim]);
}
}
STOP(section0,timers)
START(section1)
#pragma omp parallel num_threads(nthreads)
{
#pragma omp for collapse(2) schedule(dynamic,1)
for (int x0_blk0 = x_m; x0_blk0 <= x_M; x0_blk0 += x0_blk0_size)
{
for (int y0_blk0 = y_m; y0_blk0 <= y_M; y0_blk0 += y0_blk0_size)
{
for (int x = x0_blk0; x <= MIN(x_M, x0_blk0 + x0_blk0_size - 1); x += 1)
{
for (int y = y0_blk0; y <= MIN(y_M, y0_blk0 + y0_blk0_size - 1); y += 1)
{
for (int z = z_m; z <= z_M; z += 1)
{
float r5 = 1.42361111100763e-5F*p0[t0][x + 8][y + 8][z + 8];
p0[t2][x + 8][y + 8][z + 8] = r2*(r5 + 8.9285714284415e-9F*(p0[t0][x + 4][y + 8][z + 8] + p0[t0][x + 12][y + 8][z + 8]) + (-1.26984126982279e-7F)*(p0[t0][x + 5][y + 8][z + 8] + p0[t0][x + 11][y + 8][z + 8]) + 1.00000000005821e-6F*(p0[t0][x + 6][y + 8][z + 8] + p0[t0][x + 10][y + 8][z + 8]) + (-8.00000000046566e-6F)*(p0[t0][x + 7][y + 8][z + 8] + p0[t0][x + 9][y + 8][z + 8])) + r3*(r5 + 8.9285714284415e-9F*(p0[t0][x + 8][y + 4][z + 8] + p0[t0][x + 8][y + 12][z + 8]) + (-1.26984126982279e-7F)*(p0[t0][x + 8][y + 5][z + 8] + p0[t0][x + 8][y + 11][z + 8]) + 1.00000000005821e-6F*(p0[t0][x + 8][y + 6][z + 8] + p0[t0][x + 8][y + 10][z + 8]) + (-8.00000000046566e-6F)*(p0[t0][x + 8][y + 7][z + 8] + p0[t0][x + 8][y + 9][z + 8])) + r4*(r5 + 8.9285714284415e-9F*(p0[t0][x + 8][y + 8][z + 4] + p0[t0][x + 8][y + 8][z + 12]) + (-1.26984126982279e-7F)*(p0[t0][x + 8][y + 8][z + 5] + p0[t0][x + 8][y + 8][z + 11]) + 1.00000000005821e-6F*(p0[t0][x + 8][y + 8][z + 6] + p0[t0][x + 8][y + 8][z + 10]) + (-8.00000000046566e-6F)*(p0[t0][x + 8][y + 8][z + 7] + p0[t0][x + 8][y + 8][z + 9])) + p0[t1][x + 8][y + 8][z + 8];
#pragma omp simd aligned(p0,p0re:64)
for (int freq_dim = freq_dim_m; freq_dim <= freq_dim_M; freq_dim += 1)
{
p0re[x + 8][y + 8][z + 8][freq_dim] = r1[freq_dim]*p0[t0][x + 8][y + 8][z + 8] + p0re[x + 8][y + 8][z + 8][freq_dim];
}
}
}
}
}
}
}
STOP(section1,timers)
STOP(multipass0,timers)
START(section2)
#pragma omp parallel num_threads(nthreads_nonaffine)
{
int chunk_size = (int)(fmax(1, (1.0F/3.0F)*(p_src_M - p_src_m + 1)/nthreads_nonaffine));
#pragma omp for schedule(dynamic,chunk_size)
for (int p_src = p_src_m; p_src <= p_src_M; p_src += 1)
{
for (int freq_dim = freq_dim_m; freq_dim <= freq_dim_M; freq_dim += 1)
{
for (int rsrcx = 0; rsrcx <= 1; rsrcx += 1)
{
for (int rsrcy = 0; rsrcy <= 1; rsrcy += 1)
{
for (int rsrcz = 0; rsrcz <= 1; rsrcz += 1)
{
int posx = (int)(floor((-o_x + src_coords[p_src][0])/h_x));
int posy = (int)(floor((-o_y + src_coords[p_src][1])/h_y));
int posz = (int)(floor((-o_z + src_coords[p_src][2])/h_z));
float px = -floor((-o_x + src_coords[p_src][0])/h_x) + (-o_x + src_coords[p_src][0])/h_x;
float py = -floor((-o_y + src_coords[p_src][1])/h_y) + (-o_y + src_coords[p_src][1])/h_y;
float pz = -floor((-o_z + src_coords[p_src][2])/h_z) + (-o_z + src_coords[p_src][2])/h_z;
if (rsrcx + posx >= x_m - 1 && rsrcy + posy >= y_m - 1 && rsrcz + posz >= z_m - 1 && rsrcx + posx <= x_M + 1 && rsrcy + posy <= y_M + 1 && rsrcz + posz <= z_M + 1)
{
float r0 = (1.0F/100000.0F)*(rsrcx*px + (1 - rsrcx)*(1 - px))*(rsrcy*py + (1 - rsrcy)*(1 - py))*(rsrcz*pz + (1 - rsrcz)*(1 - pz))*sin(6.28318e-2F*time*frequencies[freq_dim]);
#pragma omp atomic update
p0[t2][rsrcx + posx + 8][rsrcy + posy + 8][rsrcz + posz + 8] += r0;
}
}
}
}
}
}
}
STOP(section2,timers)
}
free(r1_vec);
return 0;
}
And the interpolator:
#define _POSIX_C_SOURCE 200809L
#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);
#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;
#include "stdlib.h"
#include "math.h"
#include "sys/time.h"
#include "xmmintrin.h"
#include "pmmintrin.h"
#include "omp.h"
struct dataobj
{
void *restrict data;
unsigned long * size;
unsigned long * npsize;
unsigned long * dsize;
int * hsize;
int * hofs;
int * oofs;
void * dmap;
} ;
struct profiler
{
double section0;
} ;
extern "C" int interpolator(struct dataobj *restrict p0re_vec, struct dataobj *restrict p0recre_vec, struct dataobj *restrict p0recre_coords_vec, const int x_M, const int x_m, const int y_M, const int y_m, const int z_M, const int z_m, const int freq_dim_M, const int freq_dim_m, const float h_x, const float h_y, const float h_z, const float o_x, const float o_y, const float o_z, const int recdim_M, const int recdim_m, const int nthreads_nonaffine, struct profiler * timers);
int interpolator(struct dataobj *restrict p0re_vec, struct dataobj *restrict p0recre_vec, struct dataobj *restrict p0recre_coords_vec, const int x_M, const int x_m, const int y_M, const int y_m, const int z_M, const int z_m, const int freq_dim_M, const int freq_dim_m, const float h_x, const float h_y, const float h_z, const float o_x, const float o_y, const float o_z, const int recdim_M, const int recdim_m, const int nthreads_nonaffine, struct profiler * timers)
{
float (*restrict p0re)[p0re_vec->size[1]][p0re_vec->size[2]][p0re_vec->size[3]] __attribute__ ((aligned (64))) = (float (*)[p0re_vec->size[1]][p0re_vec->size[2]][p0re_vec->size[3]]) p0re_vec->data;
float (*restrict p0recre)[p0recre_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[p0recre_vec->size[1]]) p0recre_vec->data;
float (*restrict p0recre_coords)[p0recre_coords_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[p0recre_coords_vec->size[1]]) p0recre_coords_vec->data;
/* Flush denormal numbers to zero in hardware */
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
float r0 = 1.0F/h_x;
float r1 = 1.0F/h_y;
float r2 = 1.0F/h_z;
START(section0)
#pragma omp parallel num_threads(nthreads_nonaffine)
{
int chunk_size = (int)(fmax(1, (1.0F/3.0F)*(recdim_M - recdim_m + 1)/nthreads_nonaffine));
#pragma omp for schedule(dynamic,chunk_size)
for (int recdim = recdim_m; recdim <= recdim_M; recdim += 1)
{
for (int freq_dim = freq_dim_m; freq_dim <= freq_dim_M; freq_dim += 1)
{
float r3 = floor(r0*(-o_x + p0recre_coords[recdim][0]));
int posx = (int)r3;
float r4 = floor(r1*(-o_y + p0recre_coords[recdim][1]));
int posy = (int)r4;
float r5 = floor(r2*(-o_z + p0recre_coords[recdim][2]));
int posz = (int)r5;
float px = r0*(-o_x + p0recre_coords[recdim][0]) - r3;
float py = r1*(-o_y + p0recre_coords[recdim][1]) - r4;
float pz = r2*(-o_z + p0recre_coords[recdim][2]) - r5;
float sum = 0.0F;
for (int rp0recrex = 0; rp0recrex <= 1; rp0recrex += 1)
{
for (int rp0recrey = 0; rp0recrey <= 1; rp0recrey += 1)
{
for (int rp0recrez = 0; rp0recrez <= 1; rp0recrez += 1)
{
if (rp0recrex + posx >= x_m - 1 && rp0recrey + posy >= y_m - 1 && rp0recrez + posz >= z_m - 1 && rp0recrex + posx <= x_M + 1 && rp0recrey + posy <= y_M + 1 && rp0recrez + posz <= z_M + 1)
{
sum += (rp0recrex*px + (1 - rp0recrex)*(1 - px))*(rp0recrey*py + (1 - rp0recrey)*(1 - py))*(rp0recrez*pz + (1 - rp0recrez)*(1 - pz))*p0re[rp0recrex + posx + 8][rp0recrey + posy + 8][rp0recrez + posz + 8][freq_dim];
}
}
}
}
p0recre[recdim][freq_dim] = sum;
}
}
}
STOP(section0,timers)
return 0;
}
Now, when we run the same code with MPI we get very different results:
cvx@cbox-lukedecker-extfwimpi:~$ DEVITO_LOGGING=DEBUG DEVITO_MPI=1 OMP_NUM_THREADS=22 mpirun -n 2 -display-map -mca btl vader,self -map-by numa:PE=24 python mfe.py
libibverbs: Warning: couldn't open config directory '/usr/local/etc/libibverbs.d'.
Data for JOB [51184,1] offset 0 Total slots allocated 48
======================== JOB MAP ========================
Data for node: cbox-lukedecker-extfwimpi Num slots: 48 Max slots: 0 Num procs: 2
Process OMPI jobid: [51184,1] App: 0 Process rank: 0 Bound: socket 0[core 0[hwt 0]], socket 0[core 1[hwt 0]], socket 0[core 2[hwt 0]], socket 0[core 3[hwt 0]], socket 0[core 4[hwt 0]], socket 0[core 5[hwt 0]], socket 0[core 6[hwt 0]], socket 0[core 7[hwt 0]], socket 0[core 8[hwt 0]], socket 0[core 9[hwt 0]], socket 0[core 10[hwt 0]], socket 0[core 11[hwt 0]], socket 0[core 12[hwt 0]], socket 0[core 13[hwt 0]], socket 0[core 14[hwt 0]], socket 0[core 15[hwt 0]], socket 0[core 16[hwt 0]], socket 0[core 17[hwt 0]], socket 0[core 18[hwt 0]], socket 0[core 19[hwt 0]], socket 0[core 20[hwt 0]], socket 0[core 21[hwt 0]], socket 0[core 22[hwt 0]], socket 0[core 23[hwt 0]]:[B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B][./././././././././././././././././././././././.]
Process OMPI jobid: [51184,1] App: 0 Process rank: 1 Bound: socket 1[core 24[hwt 0]], socket 1[core 25[hwt 0]], socket 1[core 26[hwt 0]], socket 1[core 27[hwt 0]], socket 1[core 28[hwt 0]], socket 1[core 29[hwt 0]], socket 1[core 30[hwt 0]], socket 1[core 31[hwt 0]], socket 1[core 32[hwt 0]], socket 1[core 33[hwt 0]], socket 1[core 34[hwt 0]], socket 1[core 35[hwt 0]], socket 1[core 36[hwt 0]], socket 1[core 37[hwt 0]], socket 1[core 38[hwt 0]], socket 1[core 39[hwt 0]], socket 1[core 40[hwt 0]], socket 1[core 41[hwt 0]], socket 1[core 42[hwt 0]], socket 1[core 43[hwt 0]], socket 1[core 44[hwt 0]], socket 1[core 45[hwt 0]], socket 1[core 46[hwt 0]], socket 1[core 47[hwt 0]]:[./././././././././././././././././././././././.][B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B]
=============================================================
libibverbs: Warning: couldn't open config directory '/usr/local/etc/libibverbs.d'.
libibverbs: Warning: couldn't open config directory '/usr/local/etc/libibverbs.d'.
Allocating host memory for frequencies(5,) [20 B]
Allocating host memory for frequencies(5,) [20 B]
Allocating host memory for src_coords(0, 3) [0 B]
Allocating host memory for src_coords(1, 3) [12 B]
Allocating host memory for p0recre_coords(6, 3) [72 B]
Allocating host memory for p0recre_coords(6, 3) [72 B]
Operator `propagator` generated in 1.26 s
* lowering.IET: 0.61 s (48.7 %)
* specializing.IET: 0.51 s (40.8 %)
* make_mpi: 0.28 s (22.4 %)
* lowering.Clusters: 0.39 s (31.2 %)
Flops reduction after symbolic optimization: [152 --> 54]
Allocating host memory for p0(3, 41, 67, 67) [2 MB]
Operator `propagator` generated in 1.26 s
* lowering.IET: 0.61 s (48.5 %)
* specializing.IET: 0.51 s (40.6 %)
* make_mpi: 0.28 s (22.3 %)
* lowering.Clusters: 0.39 s (31.0 %)
Flops reduction after symbolic optimization: [152 --> 54]
Allocating host memory for p0(3, 42, 67, 67) [2 MB]
Allocating host memory for p0re(42, 67, 67, 5) [4 MB]
Allocating host memory for p0re(41, 67, 67, 5) [4 MB]
p_src_m=0 and p_src_M=-1 might cause no iterations along Dimension p_src
Operator `propagator` fetched `/tmp/devito-jitcache-uid1000/a653349b99b6968e90b8c82ca727c2089fdab675.cpp` in 0.13 s from jit-cache
Operator `propagator` fetched `/tmp/devito-jitcache-uid1000/a653349b99b6968e90b8c82ca727c2089fdab675.cpp` in 0.21 s from jit-cache
Operator `propagator` ran in 0.05 s
Operator `propagator` ran in 0.05 s
Global performance: [OI=0.52, 28.14 GFlops/s, 0.54 GPts/s]
Global performance: [OI=0.52, 28.14 GFlops/s, 0.54 GPts/s]
Local performance:
Local performance:
* multipass0[rank0]<> ran in 0.04 s [OI=0.52, 18.29 GFlops/s, 0.35 GPts/s]
* multipass0[rank0]<> ran in 0.04 s [OI=0.52, 18.29 GFlops/s, 0.35 GPts/s]
+ section0 ran in 0.01 s [9.28%]
+ section0 ran in 0.01 s [9.28%]
+ section1 ran in 0.04 s [90.55%]
+ section1 ran in 0.04 s [90.55%]
* multipass0[rank1]<> ran in 0.04 s [OI=0.52, 17.60 GFlops/s, 0.34 GPts/s]
* multipass0[rank1]<> ran in 0.04 s [OI=0.52, 17.60 GFlops/s, 0.34 GPts/s]
+ section0 ran in 0.01 s [9.29%]
+ section0 ran in 0.01 s [9.29%]
+ section1 ran in 0.04 s [90.65%]
+ section1 ran in 0.04 s [90.65%]
* section2[rank1]<<200,1,5,2,2,2>,<200,1,5,2,2,2>> ran in 0.01 s [OI=12.48, 0.69 GFlops/s, 0.01 GPts/s]
* section2[rank1]<<200,1,5,2,2,2>,<200,1,5,2,2,2>> ran in 0.01 s [OI=12.48, 0.69 GFlops/s, 0.01 GPts/s]
Performance[mode=advanced] arguments: {'nthreads': 22, 'nthreads_nonaffine': 22, 'x0_blk0_size': 8, 'y0_blk0_size': 8}
Performance[mode=advanced] arguments: {'nthreads': 22, 'nthreads_nonaffine': 22, 'x0_blk0_size': 8, 'y0_blk0_size': 8}
Operator `interpolator` generated in 0.27 s
* lowering.Clusters: 0.11 s (40.8 %)
* specializing.Clusters: 0.08 s (29.7 %)
* lowering.IET: 0.11 s (40.8 %)
* specializing.IET: 0.08 s (29.7 %)
Flops reduction after symbolic optimization: [36 --> 21]
Operator `interpolator` generated in 0.27 s
* lowering.Clusters: 0.11 s (40.8 %)
* specializing.Clusters: 0.08 s (29.7 %)
* lowering.IET: 0.11 s (40.8 %)
* specializing.IET: 0.08 s (29.7 %)
Flops reduction after symbolic optimization: [36 --> 21]
Allocating host memory for p0recre(6, 5) [120 B]
Allocating host memory for p0recre(6, 5) [120 B]
recdim_m=0 and recdim_M=-1 might cause no iterations along Dimension recdim
recdim_m=0 and recdim_M=-1 might cause no iterations along Dimension recdim
Operator `interpolator` fetched `/tmp/devito-jitcache-uid1000/c3edbe514ed93e1118c8a9b2ee38bcf004de8df5.cpp` in 0.02 s from jit-cache
Operator `interpolator` fetched `/tmp/devito-jitcache-uid1000/c3edbe514ed93e1118c8a9b2ee38bcf004de8df5.cpp` in 0.02 s from jit-cache
Operator `interpolator` ran in 0.01 s
Operator `interpolator` ran in 0.01 s
Global performance: [OI=3.05, 0.01 GFlops/s]
Global performance: [OI=3.05, 0.01 GFlops/s]
Local performance:
Local performance:
* section1[rank0]<<12,5>,<12,5,2,2,2>> ran in 0.01 s [OI=3.05, 0.04 GFlops/s, 0.00 GPts/s]
* section1[rank0]<<12,5>,<12,5,2,2,2>> ran in 0.01 s [OI=3.05, 0.04 GFlops/s, 0.00 GPts/s]
Performance[mode=advanced] arguments: {'nthreads_nonaffine': 22}
Performance[mode=advanced] arguments: {'nthreads_nonaffine': 22}
[[7.8738990e-14 1.2676886e-13 1.4294368e-13 1.4095608e-13 1.3165730e-13]
[7.8738990e-14 1.2676886e-13 1.4294368e-13 1.4095608e-13 1.3165730e-13]
[7.8738990e-14 1.2676886e-13 1.4294368e-13 1.4095608e-13 1.3165730e-13]
[0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00]
[0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00]
[0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00]]
[[7.8738990e-14 1.2676886e-13 1.4294368e-13 1.4095608e-13 1.3165730e-13]
[7.8738990e-14 1.2676886e-13 1.4294368e-13 1.4095608e-13 1.3165730e-13]
[7.8738990e-14 1.2676886e-13 1.4294368e-13 1.4095608e-13 1.3165730e-13]
[0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00]
[0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00]
[0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00]]
Here is the generated MPI propagator code:
#define _POSIX_C_SOURCE 200809L
#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);
#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;
#define MIN(a,b) (((a) < (b)) ? (a) : (b))
#include "stdlib.h"
#include "math.h"
#include "sys/time.h"
#include "xmmintrin.h"
#include "pmmintrin.h"
#include "mpi.h"
#include "omp.h"
struct dataobj
{
void *restrict data;
unsigned long * size;
unsigned long * npsize;
unsigned long * dsize;
int * hsize;
int * hofs;
int * oofs;
void * dmap;
} ;
struct neighborhood
{
int lll, llc, llr, lcl, lcc, lcr, lrl, lrc, lrr;
int cll, clc, clr, ccl, ccc, ccr, crl, crc, crr;
int rll, rlc, rlr, rcl, rcc, rcr, rrl, rrc, rrr;
} ;
struct profiler
{
double multipass0;
double section2;
double section0;
double section1;
} ;
extern "C" int propagator(struct dataobj *restrict frequencies_vec, struct dataobj *restrict p0_vec, struct dataobj *restrict p0re_vec, struct dataobj *restrict src_coords_vec, const int x_M, const int x_m, const int y_M, const int y_m, const int z_M, const int z_m, const int freq_dim_M, const int freq_dim_m, const float h_x, const float h_y, const float h_z, const float o_x, const float o_y, const float o_z, const int p_src_M, const int p_src_m, const int time_M, const int time_m, const int x0_blk0_size, const int y0_blk0_size, MPI_Comm comm, struct neighborhood * nb, const int nthreads, const int nthreads_nonaffine, struct profiler * timers);
static void sendrecv0(struct dataobj *restrict p0_vec, const int x_size, const int y_size, const int z_size, int ogtime, int ogx, int ogy, int ogz, int ostime, int osx, int osy, int osz, int fromrank, int torank, MPI_Comm comm, const int nthreads);
static void haloupdate0(struct dataobj *restrict p0_vec, MPI_Comm comm, struct neighborhood * nb, int otime, const int nthreads);
static void gather0(float *restrict buf_vec, int bx_size, int by_size, int bz_size, struct dataobj *restrict p0_vec, const int otime, const int ox, const int oy, const int oz, const int nthreads);
static void scatter0(float *restrict buf_vec, int bx_size, int by_size, int bz_size, struct dataobj *restrict p0_vec, const int otime, const int ox, const int oy, const int oz, const int nthreads);
int propagator(struct dataobj *restrict frequencies_vec, struct dataobj *restrict p0_vec, struct dataobj *restrict p0re_vec, struct dataobj *restrict src_coords_vec, const int x_M, const int x_m, const int y_M, const int y_m, const int z_M, const int z_m, const int freq_dim_M, const int freq_dim_m, const float h_x, const float h_y, const float h_z, const float o_x, const float o_y, const float o_z, const int p_src_M, const int p_src_m, const int time_M, const int time_m, const int x0_blk0_size, const int y0_blk0_size, MPI_Comm comm, struct neighborhood * nb, const int nthreads, const int nthreads_nonaffine, struct profiler * timers)
{
float *restrict r1_vec __attribute__ ((aligned (64)));
posix_memalign((void**)(&r1_vec),64,5*sizeof(float));
float (*restrict frequencies) __attribute__ ((aligned (64))) = (float (*)) frequencies_vec->data;
float (*restrict p0)[p0_vec->size[1]][p0_vec->size[2]][p0_vec->size[3]] __attribute__ ((aligned (64))) = (float (*)[p0_vec->size[1]][p0_vec->size[2]][p0_vec->size[3]]) p0_vec->data;
float (*restrict p0re)[p0re_vec->size[1]][p0re_vec->size[2]][p0re_vec->size[3]] __attribute__ ((aligned (64))) = (float (*)[p0re_vec->size[1]][p0re_vec->size[2]][p0re_vec->size[3]]) p0re_vec->data;
float (*restrict r1) __attribute__ ((aligned (64))) = (float (*)) r1_vec;
float (*restrict src_coords)[src_coords_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[src_coords_vec->size[1]]) src_coords_vec->data;
/* Flush denormal numbers to zero in hardware */
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
float r2 = 1.0F/(h_x*h_x);
float r3 = 1.0F/(h_y*h_y);
float r4 = 1.0F/(h_z*h_z);
for (int time = time_m, t0 = (time)%(3), t1 = (time + 2)%(3), t2 = (time + 1)%(3); time <= time_M; time += 1, t0 = (time)%(3), t1 = (time + 2)%(3), t2 = (time + 1)%(3))
{
START(multipass0)
START(section0)
#pragma omp parallel num_threads(nthreads)
{
#pragma omp for schedule(static,1)
for (int freq_dim = freq_dim_m; freq_dim <= freq_dim_M; freq_dim += 1)
{
r1[freq_dim] = sin(6.28318e-2F*time*frequencies[freq_dim]);
}
}
STOP(section0,timers)
START(section1)
haloupdate0(p0_vec,comm,nb,t0,nthreads);
#pragma omp parallel num_threads(nthreads)
{
#pragma omp for collapse(2) schedule(dynamic,1)
for (int x0_blk0 = x_m; x0_blk0 <= x_M; x0_blk0 += x0_blk0_size)
{
for (int y0_blk0 = y_m; y0_blk0 <= y_M; y0_blk0 += y0_blk0_size)
{
for (int x = x0_blk0; x <= MIN(x_M, x0_blk0 + x0_blk0_size - 1); x += 1)
{
for (int y = y0_blk0; y <= MIN(y_M, y0_blk0 + y0_blk0_size - 1); y += 1)
{
for (int z = z_m; z <= z_M; z += 1)
{
float r5 = 1.42361111100763e-5F*p0[t0][x + 8][y + 8][z + 8];
p0[t2][x + 8][y + 8][z + 8] = r2*(r5 + 8.9285714284415e-9F*(p0[t0][x + 4][y + 8][z + 8] + p0[t0][x + 12][y + 8][z + 8]) + (-1.26984126982279e-7F)*(p0[t0][x + 5][y + 8][z + 8] + p0[t0][x + 11][y + 8][z + 8]) + 1.00000000005821e-6F*(p0[t0][x + 6][y + 8][z + 8] + p0[t0][x + 10][y + 8][z + 8]) + (-8.00000000046566e-6F)*(p0[t0][x + 7][y + 8][z + 8] + p0[t0][x + 9][y + 8][z + 8])) + r3*(r5 + 8.9285714284415e-9F*(p0[t0][x + 8][y + 4][z + 8] + p0[t0][x + 8][y + 12][z + 8]) + (-1.26984126982279e-7F)*(p0[t0][x + 8][y + 5][z + 8] + p0[t0][x + 8][y + 11][z + 8]) + 1.00000000005821e-6F*(p0[t0][x + 8][y + 6][z + 8] + p0[t0][x + 8][y + 10][z + 8]) + (-8.00000000046566e-6F)*(p0[t0][x + 8][y + 7][z + 8] + p0[t0][x + 8][y + 9][z + 8])) + r4*(r5 + 8.9285714284415e-9F*(p0[t0][x + 8][y + 8][z + 4] + p0[t0][x + 8][y + 8][z + 12]) + (-1.26984126982279e-7F)*(p0[t0][x + 8][y + 8][z + 5] + p0[t0][x + 8][y + 8][z + 11]) + 1.00000000005821e-6F*(p0[t0][x + 8][y + 8][z + 6] + p0[t0][x + 8][y + 8][z + 10]) + (-8.00000000046566e-6F)*(p0[t0][x + 8][y + 8][z + 7] + p0[t0][x + 8][y + 8][z + 9])) + p0[t1][x + 8][y + 8][z + 8];
#pragma omp simd aligned(p0,p0re:64)
for (int freq_dim = freq_dim_m; freq_dim <= freq_dim_M; freq_dim += 1)
{
p0re[x + 8][y + 8][z + 8][freq_dim] = r1[freq_dim]*p0[t0][x + 8][y + 8][z + 8] + p0re[x + 8][y + 8][z + 8][freq_dim];
}
}
}
}
}
}
}
STOP(section1,timers)
STOP(multipass0,timers)
START(section2)
#pragma omp parallel num_threads(nthreads_nonaffine)
{
int chunk_size = (int)(fmax(1, (1.0F/3.0F)*(p_src_M - p_src_m + 1)/nthreads_nonaffine));
#pragma omp for schedule(dynamic,chunk_size)
for (int p_src = p_src_m; p_src <= p_src_M; p_src += 1)
{
for (int freq_dim = freq_dim_m; freq_dim <= freq_dim_M; freq_dim += 1)
{
for (int rsrcx = 0; rsrcx <= 1; rsrcx += 1)
{
for (int rsrcy = 0; rsrcy <= 1; rsrcy += 1)
{
for (int rsrcz = 0; rsrcz <= 1; rsrcz += 1)
{
int posx = (int)(floor((-o_x + src_coords[p_src][0])/h_x));
int posy = (int)(floor((-o_y + src_coords[p_src][1])/h_y));
int posz = (int)(floor((-o_z + src_coords[p_src][2])/h_z));
float px = -floor((-o_x + src_coords[p_src][0])/h_x) + (-o_x + src_coords[p_src][0])/h_x;
float py = -floor((-o_y + src_coords[p_src][1])/h_y) + (-o_y + src_coords[p_src][1])/h_y;
float pz = -floor((-o_z + src_coords[p_src][2])/h_z) + (-o_z + src_coords[p_src][2])/h_z;
if (rsrcx + posx >= x_m - 1 && rsrcy + posy >= y_m - 1 && rsrcz + posz >= z_m - 1 && rsrcx + posx <= x_M + 1 && rsrcy + posy <= y_M + 1 && rsrcz + posz <= z_M + 1)
{
float r0 = (1.0F/100000.0F)*(rsrcx*px + (1 - rsrcx)*(1 - px))*(rsrcy*py + (1 - rsrcy)*(1 - py))*(rsrcz*pz + (1 - rsrcz)*(1 - pz))*sin(6.28318e-2F*time*frequencies[freq_dim]);
#pragma omp atomic update
p0[t2][rsrcx + posx + 8][rsrcy + posy + 8][rsrcz + posz + 8] += r0;
}
}
}
}
}
}
}
STOP(section2,timers)
}
free(r1_vec);
return 0;
}
static void sendrecv0(struct dataobj *restrict p0_vec, const int x_size, const int y_size, const int z_size, int ogtime, int ogx, int ogy, int ogz, int ostime, int osx, int osy, int osz, int fromrank, int torank, MPI_Comm comm, const int nthreads)
{
float *restrict bufg_vec __attribute__ ((aligned (64)));
posix_memalign((void**)(&bufg_vec),64,x_size*y_size*z_size*sizeof(float));
float *restrict bufs_vec __attribute__ ((aligned (64)));
posix_memalign((void**)(&bufs_vec),64,x_size*y_size*z_size*sizeof(float));
MPI_Request rrecv;
MPI_Request rsend;
MPI_Irecv(bufs_vec,x_size*y_size*z_size,MPI_FLOAT,fromrank,13,comm,&(rrecv));
if (torank != MPI_PROC_NULL)
{
gather0(bufg_vec,x_size,y_size,z_size,p0_vec,ogtime,ogx,ogy,ogz,nthreads);
}
MPI_Isend(bufg_vec,x_size*y_size*z_size,MPI_FLOAT,torank,13,comm,&(rsend));
MPI_Wait(&(rsend),MPI_STATUS_IGNORE);
MPI_Wait(&(rrecv),MPI_STATUS_IGNORE);
if (fromrank != MPI_PROC_NULL)
{
scatter0(bufs_vec,x_size,y_size,z_size,p0_vec,ostime,osx,osy,osz,nthreads);
}
free(bufg_vec);
free(bufs_vec);
}
static void haloupdate0(struct dataobj *restrict p0_vec, MPI_Comm comm, struct neighborhood * nb, int otime, const int nthreads)
{
sendrecv0(p0_vec,p0_vec->hsize[3],p0_vec->npsize[2],p0_vec->npsize[3],otime,p0_vec->oofs[2],p0_vec->hofs[4],p0_vec->hofs[6],otime,p0_vec->hofs[3],p0_vec->hofs[4],p0_vec->hofs[6],nb->rcc,nb->lcc,comm,nthreads);
sendrecv0(p0_vec,p0_vec->hsize[2],p0_vec->npsize[2],p0_vec->npsize[3],otime,p0_vec->oofs[3],p0_vec->hofs[4],p0_vec->hofs[6],otime,p0_vec->hofs[2],p0_vec->hofs[4],p0_vec->hofs[6],nb->lcc,nb->rcc,comm,nthreads);
sendrecv0(p0_vec,p0_vec->npsize[1],p0_vec->hsize[5],p0_vec->npsize[3],otime,p0_vec->hofs[2],p0_vec->oofs[4],p0_vec->hofs[6],otime,p0_vec->hofs[2],p0_vec->hofs[5],p0_vec->hofs[6],nb->crc,nb->clc,comm,nthreads);
sendrecv0(p0_vec,p0_vec->npsize[1],p0_vec->hsize[4],p0_vec->npsize[3],otime,p0_vec->hofs[2],p0_vec->oofs[5],p0_vec->hofs[6],otime,p0_vec->hofs[2],p0_vec->hofs[4],p0_vec->hofs[6],nb->clc,nb->crc,comm,nthreads);
sendrecv0(p0_vec,p0_vec->npsize[1],p0_vec->npsize[2],p0_vec->hsize[7],otime,p0_vec->hofs[2],p0_vec->hofs[4],p0_vec->oofs[6],otime,p0_vec->hofs[2],p0_vec->hofs[4],p0_vec->hofs[7],nb->ccr,nb->ccl,comm,nthreads);
sendrecv0(p0_vec,p0_vec->npsize[1],p0_vec->npsize[2],p0_vec->hsize[6],otime,p0_vec->hofs[2],p0_vec->hofs[4],p0_vec->oofs[7],otime,p0_vec->hofs[2],p0_vec->hofs[4],p0_vec->hofs[6],nb->ccl,nb->ccr,comm,nthreads);
}
static void gather0(float *restrict buf_vec, int bx_size, int by_size, int bz_size, struct dataobj *restrict p0_vec, const int otime, const int ox, const int oy, const int oz, const int nthreads)
{
float (*restrict buf)[bx_size][by_size][bz_size] __attribute__ ((aligned (64))) = (float (*)[bx_size][by_size][bz_size]) buf_vec;
float (*restrict p0)[p0_vec->size[1]][p0_vec->size[2]][p0_vec->size[3]] __attribute__ ((aligned (64))) = (float (*)[p0_vec->size[1]][p0_vec->size[2]][p0_vec->size[3]]) p0_vec->data;
const int x_m = 0;
const int y_m = 0;
const int z_m = 0;
const int x_M = bx_size - 1;
const int y_M = by_size - 1;
const int z_M = bz_size - 1;
#pragma omp parallel num_threads(nthreads)
{
#pragma omp for collapse(2) schedule(static,1)
for (int x = x_m; x <= x_M; x += 1)
{
for (int y = y_m; y <= y_M; y += 1)
{
#pragma omp simd aligned(p0:64)
for (int z = z_m; z <= z_M; z += 1)
{
buf[0][x][y][z] = p0[otime][x + ox][y + oy][z + oz];
}
}
}
}
}
static void scatter0(float *restrict buf_vec, int bx_size, int by_size, int bz_size, struct dataobj *restrict p0_vec, const int otime, const int ox, const int oy, const int oz, const int nthreads)
{
float (*restrict buf)[bx_size][by_size][bz_size] __attribute__ ((aligned (64))) = (float (*)[bx_size][by_size][bz_size]) buf_vec;
float (*restrict p0)[p0_vec->size[1]][p0_vec->size[2]][p0_vec->size[3]] __attribute__ ((aligned (64))) = (float (*)[p0_vec->size[1]][p0_vec->size[2]][p0_vec->size[3]]) p0_vec->data;
const int x_m = 0;
const int y_m = 0;
const int z_m = 0;
const int x_M = bx_size - 1;
const int y_M = by_size - 1;
const int z_M = bz_size - 1;
#pragma omp parallel num_threads(nthreads)
{
#pragma omp for collapse(2) schedule(static,1)
for (int x = x_m; x <= x_M; x += 1)
{
for (int y = y_m; y <= y_M; y += 1)
{
#pragma omp simd aligned(p0:64)
for (int z = z_m; z <= z_M; z += 1)
{
p0[otime][x + ox][y + oy][z + oz] = buf[0][x][y][z];
}
}
}
}
}
And the generated MPI interpolator code:
#define _POSIX_C_SOURCE 200809L
#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);
#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;
#include "stdlib.h"
#include "math.h"
#include "sys/time.h"
#include "xmmintrin.h"
#include "pmmintrin.h"
#include "mpi.h"
#include "omp.h"
struct dataobj
{
void *restrict data;
unsigned long * size;
unsigned long * npsize;
unsigned long * dsize;
int * hsize;
int * hofs;
int * oofs;
void * dmap;
} ;
struct profiler
{
double section0;
double section1;
} ;
extern "C" int interpolator(struct dataobj *restrict p0re_vec, struct dataobj *restrict p0recre_vec, struct dataobj *restrict p0recre_coords_vec, const int x_M, const int x_m, const int y_M, const int y_m, const int z_M, const int z_m, const int freq_dim_M, const int freq_dim_m, const float h_x, const float h_y, const float h_z, const float o_x, const float o_y, const float o_z, const int recdim_M, const int recdim_m, const int nthreads_nonaffine, struct profiler * timers);
int interpolator(struct dataobj *restrict p0re_vec, struct dataobj *restrict p0recre_vec, struct dataobj *restrict p0recre_coords_vec, const int x_M, const int x_m, const int y_M, const int y_m, const int z_M, const int z_m, const int freq_dim_M, const int freq_dim_m, const float h_x, const float h_y, const float h_z, const float o_x, const float o_y, const float o_z, const int recdim_M, const int recdim_m, const int nthreads_nonaffine, struct profiler * timers)
{
float (*restrict p0re)[p0re_vec->size[1]][p0re_vec->size[2]][p0re_vec->size[3]] __attribute__ ((aligned (64))) = (float (*)[p0re_vec->size[1]][p0re_vec->size[2]][p0re_vec->size[3]]) p0re_vec->data;
float (*restrict p0recre)[p0recre_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[p0recre_vec->size[1]]) p0recre_vec->data;
float (*restrict p0recre_coords)[p0recre_coords_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[p0recre_coords_vec->size[1]]) p0recre_coords_vec->data;
/* Flush denormal numbers to zero in hardware */
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
float r0 = 1.0F/h_x;
float r1 = 1.0F/h_y;
float r2 = 1.0F/h_z;
START(section0)
for (int freq_dim = freq_dim_m; freq_dim <= freq_dim_M; freq_dim += 1)
{
}
STOP(section0,timers)
START(section1)
#pragma omp parallel num_threads(nthreads_nonaffine)
{
int chunk_size = (int)(fmax(1, (1.0F/3.0F)*(recdim_M - recdim_m + 1)/nthreads_nonaffine));
#pragma omp for schedule(dynamic,chunk_size)
for (int recdim = recdim_m; recdim <= recdim_M; recdim += 1)
{
for (int freq_dim = freq_dim_m; freq_dim <= freq_dim_M; freq_dim += 1)
{
float r3 = floor(r0*(-o_x + p0recre_coords[recdim][0]));
int posx = (int)r3;
float r4 = floor(r1*(-o_y + p0recre_coords[recdim][1]));
int posy = (int)r4;
float r5 = floor(r2*(-o_z + p0recre_coords[recdim][2]));
int posz = (int)r5;
float px = r0*(-o_x + p0recre_coords[recdim][0]) - r3;
float py = r1*(-o_y + p0recre_coords[recdim][1]) - r4;
float pz = r2*(-o_z + p0recre_coords[recdim][2]) - r5;
float sum = 0.0F;
for (int rp0recrex = 0; rp0recrex <= 1; rp0recrex += 1)
{
for (int rp0recrey = 0; rp0recrey <= 1; rp0recrey += 1)
{
for (int rp0recrez = 0; rp0recrez <= 1; rp0recrez += 1)
{
if (rp0recrex + posx >= x_m - 1 && rp0recrey + posy >= y_m - 1 && rp0recrez + posz >= z_m - 1 && rp0recrex + posx <= x_M + 1 && rp0recrey + posy <= y_M + 1 && rp0recrez + posz <= z_M + 1)
{
sum += (rp0recrex*px + (1 - rp0recrex)*(1 - px))*(rp0recrey*py + (1 - rp0recrey)*(1 - py))*(rp0recrez*pz + (1 - rp0recrez)*(1 - pz))*p0re[rp0recrex + posx + 8][rp0recrey + posy + 8][rp0recrez + posz + 8][freq_dim];
}
}
}
}
p0recre[recdim][freq_dim] = sum;
}
}
}
STOP(section1,timers)
return 0;
}
Metadata
Assignees
Labels
No labels