Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ New features
2.5.2 (upcoming)
================
- Fix python_requires in setup.py [#302]

- Faster `FALLBACK` kernels [#303]

2.5.1 (28/07/2023)
==================

Expand Down
66 changes: 15 additions & 51 deletions mocks/DDrppi_mocks/countpairs_rp_pi_mocks_kernels.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -1130,8 +1130,8 @@ static inline int countpairs_rp_pi_mocks_fallback_DOUBLE(const int64_t N0, DOUBL
const DOUBLE min_xdiff, const DOUBLE min_ydiff, const DOUBLE min_zdiff,
const DOUBLE closest_icell_xpos, const DOUBLE closest_icell_ypos,
const DOUBLE closest_icell_zpos,
DOUBLE *src_rpavg, uint64_t *src_npairs,
DOUBLE *src_weightavg, const weight_method_t weight_method)
DOUBLE *restrict src_rpavg, uint64_t *restrict src_npairs,
DOUBLE *restrict src_weightavg, const weight_method_t weight_method)
{
if(N0 == 0 || N1 == 0) {
return EXIT_SUCCESS;
Expand All @@ -1141,39 +1141,16 @@ static inline int countpairs_rp_pi_mocks_fallback_DOUBLE(const int64_t N0, DOUBL
return EXIT_FAILURE;
}

const int32_t need_rpavg = src_rpavg != NULL;
const int32_t need_weightavg = src_weightavg != NULL;

(void) fast_divide_and_NR_steps;//unused parameter but required to keep the same function signature amongst the kernels

/*----------------- FALLBACK CODE --------------------*/
const int64_t totnbins = (npibin+1)*(nbin+1);
const DOUBLE sqr_pimax = pimax*pimax;
const DOUBLE sqr_max_sep = sqr_rpmax + sqr_pimax;

uint64_t npairs[totnbins];
DOUBLE rpavg[totnbins], weightavg[totnbins];
for(int i=0;i<totnbins;i++) {
npairs[i] = 0;
if(need_rpavg) {
rpavg[i]=ZERO;
}
if(need_weightavg){
weightavg[i]=ZERO;
}
}

// A copy whose pointers we can advance
weight_struct_DOUBLE local_w0 = {.weights={NULL}, .num_weights=0}, local_w1 = {.weights={NULL}, .num_weights=0};
pair_struct_DOUBLE pair = {.num_weights=0};
weight_func_t_DOUBLE weight_func = NULL;
if(need_weightavg){
// Same particle list, new copy of num_weights pointers into that list
local_w0 = *weights0;
local_w1 = *weights1;

pair.num_weights = local_w0.num_weights;

if(src_weightavg != NULL){
pair.num_weights = weights0->num_weights;
weight_func = get_weight_func_by_method_DOUBLE(weight_method);
}

Expand All @@ -1187,7 +1164,7 @@ static inline int countpairs_rp_pi_mocks_fallback_DOUBLE(const int64_t N0, DOUBL
const DOUBLE ypos = *y0++;
const DOUBLE zpos = *z0++;
for(int w = 0; w < pair.num_weights; w++){
pair.weights0[w].d = *local_w0.weights[w]++;
pair.weights0[w].d = *(weights0->weights[w] + i);
}
DOUBLE max_dz = max_all_dz;

Expand Down Expand Up @@ -1249,9 +1226,6 @@ static inline int countpairs_rp_pi_mocks_fallback_DOUBLE(const int64_t N0, DOUBL
const int64_t n_off = localz1 - zstart;
DOUBLE *localx1 = x1 + n_off;
DOUBLE *localy1 = y1 + n_off;
for(int w = 0; w < local_w1.num_weights; w++){
local_w1.weights[w] = weights1->weights[w] + n_off;
}

const int64_t nleft = N1 - n_off;
for(int64_t j=0;j<nleft;j++){
Expand All @@ -1263,10 +1237,6 @@ static inline int countpairs_rp_pi_mocks_fallback_DOUBLE(const int64_t N0, DOUBL
const DOUBLE perpy = *localy1++ - ypos;
const DOUBLE perpz = *localz1++ - zpos;

for(int w = 0; w < pair.num_weights; w++){
pair.weights1[w].d = *local_w1.weights[w]++;
}

/* particles are sorted on z, all future j particles will have larger value in localz1 -> perpz will be
larger. therefore, if current perpz >= max_dz, all future iterations will also be larger. we can
terminate the loop */
Expand All @@ -1288,10 +1258,13 @@ static inline int countpairs_rp_pi_mocks_fallback_DOUBLE(const int64_t N0, DOUBL
if(sqr_Dperp >= sqr_rpmax || sqr_Dperp < sqr_rpmin) continue;

DOUBLE rp = ZERO, pairweight = ZERO;
if(need_rpavg) {
if(src_rpavg != NULL) {
rp = SQRT(sqr_Dperp);
}
if(need_weightavg){
if(src_weightavg != NULL){
for(int w = 0; w < pair.num_weights; w++){
pair.weights1[w].d = *(weights1->weights[w] + n_off + j);
}
pair.dx.d = perpx;
pair.dy.d = perpy;
pair.dz.d = perpz;
Expand All @@ -1306,28 +1279,19 @@ static inline int countpairs_rp_pi_mocks_fallback_DOUBLE(const int64_t N0, DOUBL
for(int kbin=nbin-1;kbin>=1;kbin--) {
if(sqr_Dperp >= rupp_sqr[kbin-1]) {
const int ibin = kbin*(npibin+1) + pibin;
npairs[ibin]++;
if(need_rpavg) {
rpavg[ibin]+=rp;
src_npairs[ibin]++;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess the idea here is that there's no point in making stack buffers, since the passed buffers are already local to the current thread?

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yup. I am also considering whether it would be worthwhile to move to a malloc'ed buffer rather than the stack (but there may be side-effects of false sharing under OpenMP with such a malloc'ed src_npairs[nthreads][nbins] kind of matrix)

if(src_rpavg != NULL) {
src_rpavg[ibin] += rp;
}
if(need_weightavg){
weightavg[ibin] += pairweight;
if(src_weightavg != NULL){
src_weightavg[ibin] += pairweight;
}
break;
}
}//finding kbin
}//j loop over second set of particles
}//i loop over first set of particles

for(int i=0;i<totnbins;i++) {
src_npairs[i] += npairs[i];
if(need_rpavg) {
src_rpavg[i] += rpavg[i];
}
if(need_weightavg){
src_weightavg[i] += weightavg[i];
}
}

return EXIT_SUCCESS;
}//end of fallback code
71 changes: 17 additions & 54 deletions mocks/DDsmu_mocks/countpairs_s_mu_mocks_kernels.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -1072,9 +1072,10 @@ static inline int countpairs_s_mu_mocks_fallback_DOUBLE(const int64_t N0, DOUBLE
const DOUBLE min_xdiff, const DOUBLE min_ydiff, const DOUBLE min_zdiff,
const DOUBLE closest_icell_xpos, const DOUBLE closest_icell_ypos,
const DOUBLE closest_icell_zpos,
DOUBLE *src_savg, uint64_t *src_npairs,
DOUBLE *src_weightavg, const weight_method_t weight_method)
DOUBLE *restrict src_savg, uint64_t *restrict src_npairs,
DOUBLE *restrict src_weightavg, const weight_method_t weight_method)
{
/*----------------- FALLBACK CODE --------------------*/
if(N0 == 0 || N1 == 0) {
return EXIT_SUCCESS;
}
Expand All @@ -1084,39 +1085,15 @@ static inline int countpairs_s_mu_mocks_fallback_DOUBLE(const int64_t N0, DOUBLE
}

(void) fast_divide_and_NR_steps;//unused parameter but required to keep the same function signature amongst the kernels
const int32_t need_savg = src_savg != NULL;
const int32_t need_weightavg = src_weightavg != NULL;

const DOUBLE sqr_mumax = mu_max*mu_max;

/*----------------- FALLBACK CODE --------------------*/
const int64_t totnbins = (nmu_bins+1)*(nsbin+1);

uint64_t npairs[totnbins];
DOUBLE savg[totnbins], weightavg[totnbins];
for(int i=0;i<totnbins;i++) {
npairs[i] = ZERO;
if(need_savg) {
savg[i]=ZERO;
}
if(need_weightavg){
weightavg[i]=ZERO;
}
}

// A copy whose pointers we can advance
weight_struct_DOUBLE local_w0 = {.weights={NULL}, .num_weights=0},
local_w1 = {.weights={NULL}, .num_weights=0};
pair_struct_DOUBLE pair = {.num_weights=0};
weight_func_t_DOUBLE weight_func = NULL;
if(need_weightavg){
// Same particle list, new copy of num_weights pointers into that list
local_w0 = *weights0;
local_w1 = *weights1;
pair.num_weights = local_w0.num_weights;
if(src_weightavg != NULL){
pair.num_weights = weights0->num_weights;
weight_func = get_weight_func_by_method_DOUBLE(weight_method);
}

const DOUBLE sqr_mumax = mu_max*mu_max;
const DOUBLE dmu = mu_max/(DOUBLE) nmu_bins;
const DOUBLE inv_dmu = 1.0/dmu;

Expand All @@ -1127,7 +1104,7 @@ static inline int countpairs_s_mu_mocks_fallback_DOUBLE(const int64_t N0, DOUBLE
const DOUBLE ypos = *y0++;
const DOUBLE zpos = *z0++;
for(int w = 0; w < pair.num_weights; w++){
pair.weights0[w].d = *local_w0.weights[w]++;
pair.weights0[w].d = *(weights0->weights[w] + i);
}

DOUBLE max_dz = max_all_dz;
Expand Down Expand Up @@ -1191,9 +1168,6 @@ static inline int countpairs_s_mu_mocks_fallback_DOUBLE(const int64_t N0, DOUBLE
const int64_t nleft = N1 - n_off;
DOUBLE *localx1 = x1 + n_off;
DOUBLE *localy1 = y1 + n_off;
for(int w = 0; w < pair.num_weights; w++){
local_w1.weights[w] = weights1->weights[w] + n_off;
}

for(int64_t j=0;j<nleft;j++){
const DOUBLE parx = xpos + *localx1;
Expand All @@ -1204,10 +1178,6 @@ static inline int countpairs_s_mu_mocks_fallback_DOUBLE(const int64_t N0, DOUBLE
const DOUBLE perpy = *localy1++ - ypos;
const DOUBLE perpz = *localz1++ - zpos;

for(int w = 0; w < pair.num_weights; w++){
pair.weights1[w].d = *local_w1.weights[w]++;
}

if(perpz >= max_dz) break;

const DOUBLE sqr_s = perpx*perpx + perpy*perpy + perpz*perpz;
Expand All @@ -1219,10 +1189,13 @@ static inline int countpairs_s_mu_mocks_fallback_DOUBLE(const int64_t N0, DOUBLE
const DOUBLE sqr_mu = sqr_s_dot_l/(sqr_l * sqr_s);
const int mubin = (sqr_mu >= sqr_mumax) ? nmu_bins:(int) (SQRT(sqr_mu)*inv_dmu);
DOUBLE s = ZERO, pairweight = ZERO;
if(need_savg) {
if(src_savg != NULL) {
s = SQRT(sqr_s);
}
if(need_weightavg){
if(src_weightavg != NULL){
for(int w = 0; w < pair.num_weights; w++){
pair.weights1[w].d = *(weights1->weights[w] + n_off + j);
}
pair.dx.d = perpx;
pair.dy.d = perpy;
pair.dz.d = perpz;
Expand All @@ -1237,28 +1210,18 @@ static inline int countpairs_s_mu_mocks_fallback_DOUBLE(const int64_t N0, DOUBLE
for(int kbin=nsbin-1;kbin>=1;kbin--) {
if(sqr_s >= supp_sqr[kbin-1]) {
const int ibin = kbin*(nmu_bins+1) + mubin;
npairs[ibin]++;
if(need_savg) {
savg[ibin]+=s;
src_npairs[ibin]++;
if(src_savg != NULL) {
src_savg[ibin] += s;
}
if(need_weightavg){
weightavg[ibin] += pairweight;
if(src_weightavg != NULL){
src_weightavg[ibin] += pairweight;
}
break;
}
}//finding kbin
}//j loop over second set of particles
}//i loop over first set of particles

for(int i=0;i<totnbins;i++) {
src_npairs[i] += npairs[i];
if(need_savg) {
src_savg[i] += savg[i];
}
if(need_weightavg){
src_weightavg[i] += weightavg[i];
}
}

return EXIT_SUCCESS;
}//end of fallback code
Loading