Skip to content

Commit 12badfd

Browse files
Refactor 9x9 blurring kernels
- also some minor compiler and OpenMP tweaks
1 parent 294616a commit 12badfd

File tree

4 files changed

+188
-89
lines changed

4 files changed

+188
-89
lines changed

src/develop/blend.c

Lines changed: 2 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -726,31 +726,8 @@ static void _refine_with_detail_mask_cl(struct dt_iop_module_t *self, struct dt_
726726
}
727727

728728
{
729-
// For a blurring sigma of 2.0f a 13x13 kernel would be optimally required but the 9x9 is by far good enough here
730-
float kernel[9][9];
731-
const float temp = -8.0f; // -2.0f * 2.0f * 2.0f; for a sigma of 2
732-
float sum = 0.0f;
733-
for(int i = -4; i <= 4; i++)
734-
{
735-
for(int j = -4; j <= 4; j++)
736-
{
737-
kernel[i + 4][j + 4] = expf( ((i*i) + (j*j)) / temp);
738-
sum += kernel[i + 4][j + 4];
739-
}
740-
}
741-
for(int i = 0; i < 9; i++)
742-
{
743-
#if defined(__GNUC__)
744-
#pragma GCC ivdep
745-
#endif
746-
for(int j = 0; j < 9; j++)
747-
kernel[i][j] /= sum;
748-
}
749-
750-
float blurmat[13] = { kernel[4][4], kernel[3][4], kernel[3][3], // 00: c00 c10 c11
751-
kernel[2][4], kernel[2][3], kernel[2][2], // 03: c20 c21 c22
752-
kernel[1][4], kernel[1][3], kernel[1][2], kernel[1][1], // 06: c30 c31 c32 c33
753-
kernel[0][4], kernel[0][3], kernel[0][2]}; // 10: c40 c41 c42
729+
float blurmat[13];
730+
dt_masks_blur_9x9_coeff(blurmat, 2.0f);
754731
cl_mem dev_blurmat = NULL;
755732
dev_blurmat = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * 13, blurmat);
756733
if(dev_blurmat != NULL)
@@ -771,7 +748,6 @@ static void _refine_with_detail_mask_cl(struct dt_iop_module_t *self, struct dt_
771748
dt_opencl_release_mem_object(dev_blurmat);
772749
goto error;
773750
}
774-
775751
}
776752

777753
{

src/develop/masks.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -429,8 +429,9 @@ void dt_masks_calculate_source_pos_value(dt_masks_form_gui_t *gui, const int mas
429429
const float initial_ypos, const float xpos, const float ypos, float *px,
430430
float *py, const int adding);
431431

432-
/** luminance mask support */
432+
/** detail mask support */
433433
void dt_masks_extend_border(float *mask, const int width, const int height, const int border);
434+
void dt_masks_blur_9x9_coeff(float *coeffs, const float sigma);
434435
void dt_masks_blur_9x9(float *const src, float *const out, const int width, const int height, const float sigma);
435436
void dt_masks_calc_rawdetail_mask(float *const src, float *const out, float *const tmp, const int width,
436437
const int height, const dt_aligned_pixel_t wb);

src/develop/masks/detail.c

Lines changed: 182 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -90,9 +90,18 @@ static inline float sqrf(float a)
9090
return a * a;
9191
}
9292

93+
#ifdef _OPENMP
94+
#pragma omp declare simd aligned(mask : 64)
95+
#endif
9396
void dt_masks_extend_border(float *mask, const int width, const int height, const int border)
9497
{
9598
if(border <= 0) return;
99+
#ifdef _OPENMP
100+
#pragma omp parallel for default(none) \
101+
dt_omp_firstprivate(mask) \
102+
dt_omp_sharedconst(width, height, border) \
103+
schedule(simd:static)
104+
#endif
96105
for(int row = border; row < height - border; row++)
97106
{
98107
const int idx = row * width;
@@ -102,6 +111,12 @@ void dt_masks_extend_border(float *mask, const int width, const int height, cons
102111
mask[idx + width - i - 1] = mask[idx + width - border -1];
103112
}
104113
}
114+
#ifdef _OPENMP
115+
#pragma omp parallel for default(none) \
116+
dt_omp_firstprivate(mask) \
117+
dt_omp_sharedconst(width, height, border) \
118+
schedule(simd:static)
119+
#endif
105120
for(int col = 0; col < width; col++)
106121
{
107122
const float top = mask[border * width + MIN(width - border - 1, MAX(col, border))];
@@ -114,49 +129,119 @@ void dt_masks_extend_border(float *mask, const int width, const int height, cons
114129
}
115130
}
116131

117-
#ifdef _OPENMP
118-
#pragma omp declare simd aligned(src, out : 64)
132+
void _masks_blur_5x5_coeff(float *c, const float sigma)
133+
{
134+
float kernel[5][5];
135+
const float temp = -2.0f * sqrf(sigma);
136+
const float range = sqrf(3.0f * 0.84f);
137+
float sum = 0.0f;
138+
for(int k = -2; k <= 2; k++)
139+
{
140+
for(int j = -2; j <= 2; j++)
141+
{
142+
if((sqrf(k) + sqrf(j)) <= range)
143+
{
144+
kernel[k + 2][j + 2] = expf((sqrf(k) + sqrf(j)) / temp);
145+
sum += kernel[k + 2][j + 2];
146+
}
147+
else
148+
kernel[k + 2][j + 2] = 0.0f;
149+
}
150+
}
151+
for(int i = 0; i < 5; i++)
152+
{
153+
#if defined(__GNUC__)
154+
#pragma GCC ivdep
119155
#endif
120-
void dt_masks_blur_9x9(float *const restrict src, float *const restrict out, const int width, const int height, const float sigma)
156+
for(int j = 0; j < 5; j++)
157+
kernel[i][j] /= sum;
158+
}
159+
/* c21 */ c[0] = kernel[0][1];
160+
/* c20 */ c[1] = kernel[0][2];
161+
/* c11 */ c[2] = kernel[1][1];
162+
/* c10 */ c[3] = kernel[1][2];
163+
/* c00 */ c[4] = kernel[2][2];
164+
}
165+
/* Weighted sum of the 21 taps of a corner-less 5x5 window around src[i].
 *
 * The expression picks up these names from the expansion site:
 *   src      pixel buffer being read
 *   i        index of the centre pixel
 *   w1, w2   offsets of one and two rows
 *   blurmat  five weights in the order c21, c20, c11, c10, c00
 *            (the order written by _masks_blur_5x5_coeff)
 * The four corner taps (two rows and two columns away) are not sampled.
 */
#define FAST_BLUR_5 \
  ( blurmat[0] * (  (src[i - w2 - 1] + src[i - w2 + 1]) \
                  + (src[i - w1 - 2] + src[i - w1 + 2]) \
                  + (src[i + w1 - 2] + src[i + w1 + 2]) \
                  + (src[i + w2 - 1] + src[i + w2 + 1])) \
  + blurmat[1] * (src[i - w2] + src[i - 2] + src[i + 2] + src[i + w2]) \
  + blurmat[2] * (src[i - w1 - 1] + src[i - w1 + 1] + src[i + w1 - 1] + src[i + w1 + 1]) \
  + blurmat[3] * (src[i - w1] + src[i - 1] + src[i + 1] + src[i + w1]) \
  + blurmat[4] * src[i] )
171+
172+
void dt_masks_blur_9x9_coeff(float *c, const float sigma)
121173
{
122-
// For a blurring sigma of 2.0f a 13x13 kernel would be optimally required but the 9x9 is by far good enough here
123174
float kernel[9][9];
124-
const float temp = 2.0f * sqrf(sigma);
175+
const float temp = -2.0f * sqrf(sigma);
176+
const float range = sqrf(3.0f * 1.5f);
125177
float sum = 0.0f;
126-
for(int i = -4; i <= 4; i++)
178+
for(int k = -4; k <= 4; k++)
127179
{
128180
for(int j = -4; j <= 4; j++)
129181
{
130-
kernel[i + 4][j + 4] = expf( -(sqrf(i) + sqrf(j)) / temp);
131-
sum += kernel[i + 4][j + 4];
182+
if((sqrf(k) + sqrf(j)) <= range)
183+
{
184+
kernel[k + 4][j + 4] = expf((sqrf(k) + sqrf(j)) / temp);
185+
sum += kernel[k + 4][j + 4];
186+
}
187+
else
188+
kernel[k + 4][j + 4] = 0.0f;
132189
}
133190
}
134191
for(int i = 0; i < 9; i++)
135192
{
193+
#if defined(__GNUC__)
194+
#pragma GCC ivdep
195+
#endif
136196
for(int j = 0; j < 9; j++)
137197
kernel[i][j] /= sum;
138198
}
139-
const float c42 = kernel[0][2];
140-
const float c41 = kernel[0][3];
141-
const float c40 = kernel[0][4];
142-
const float c33 = kernel[1][1];
143-
const float c32 = kernel[1][2];
144-
const float c31 = kernel[1][3];
145-
const float c30 = kernel[1][4];
146-
const float c22 = kernel[2][2];
147-
const float c21 = kernel[2][3];
148-
const float c20 = kernel[2][4];
149-
const float c11 = kernel[3][3];
150-
const float c10 = kernel[3][4];
151-
const float c00 = kernel[4][4];
199+
/* c00 */ c[0] = kernel[4][4];
200+
/* c10 */ c[1] = kernel[3][4];
201+
/* c11 */ c[2] = kernel[3][3];
202+
/* c20 */ c[3] = kernel[2][4];
203+
/* c21 */ c[4] = kernel[2][3];
204+
/* c22 */ c[5] = kernel[2][2];
205+
/* c30 */ c[6] = kernel[1][4];
206+
/* c31 */ c[7] = kernel[1][3];
207+
/* c32 */ c[8] = kernel[1][2];
208+
/* c33 */ c[9] = kernel[1][1];
209+
/* c40 */ c[10] = kernel[0][4];
210+
/* c41 */ c[11] = kernel[0][3];
211+
/* c42 */ c[12] = kernel[0][2];
212+
}
213+
214+
/* Weighted sum of the 69 taps of a circularly truncated 9x9 window around
 * src[i].
 *
 * The expression picks up these names from the expansion site:
 *   src       pixel buffer being read
 *   i         index of the centre pixel
 *   w1 .. w4  offsets of one to four rows
 *   blurmat   thirteen weights in the order c00, c10, c11, c20, c21, c22,
 *             c30, c31, c32, c33, c40, c41, c42 (the order written by
 *             dt_masks_blur_9x9_coeff)
 * Taps at offsets (4,3), (3,4) and (4,4) are not sampled.
 */
#define FAST_BLUR_9 \
  ( blurmat[12] * (src[i - w4 - 2] + src[i - w4 + 2] + src[i - w2 - 4] + src[i - w2 + 4] \
                   + src[i + w2 - 4] + src[i + w2 + 4] + src[i + w4 - 2] + src[i + w4 + 2]) \
  + blurmat[11] * (src[i - w4 - 1] + src[i - w4 + 1] + src[i - w1 - 4] + src[i - w1 + 4] \
                   + src[i + w1 - 4] + src[i + w1 + 4] + src[i + w4 - 1] + src[i + w4 + 1]) \
  + blurmat[10] * (src[i - w4] + src[i - 4] + src[i + 4] + src[i + w4]) \
  + blurmat[9] * (src[i - w3 - 3] + src[i - w3 + 3] + src[i + w3 - 3] + src[i + w3 + 3]) \
  + blurmat[8] * (src[i - w3 - 2] + src[i - w3 + 2] + src[i - w2 - 3] + src[i - w2 + 3] \
                  + src[i + w2 - 3] + src[i + w2 + 3] + src[i + w3 - 2] + src[i + w3 + 2]) \
  + blurmat[7] * (src[i - w3 - 1] + src[i - w3 + 1] + src[i - w1 - 3] + src[i - w1 + 3] \
                  + src[i + w1 - 3] + src[i + w1 + 3] + src[i + w3 - 1] + src[i + w3 + 1]) \
  + blurmat[6] * (src[i - w3] + src[i - 3] + src[i + 3] + src[i + w3]) \
  + blurmat[5] * (src[i - w2 - 2] + src[i - w2 + 2] + src[i + w2 - 2] + src[i + w2 + 2]) \
  + blurmat[4] * (src[i - w2 - 1] + src[i - w2 + 1] + src[i - w1 - 2] + src[i - w1 + 2] \
                  + src[i + w1 - 2] + src[i + w1 + 2] + src[i + w2 - 1] + src[i + w2 + 1]) \
  + blurmat[3] * (src[i - w2] + src[i - 2] + src[i + 2] + src[i + w2]) \
  + blurmat[2] * (src[i - w1 - 1] + src[i - w1 + 1] + src[i + w1 - 1] + src[i + w1 + 1]) \
  + blurmat[1] * (src[i - w1] + src[i - 1] + src[i + 1] + src[i + w1]) \
  + blurmat[0] * src[i] )
228+
229+
#ifdef _OPENMP
230+
#pragma omp declare simd aligned(src, out : 64)
231+
#endif
232+
void dt_masks_blur_9x9(float *const restrict src, float *const restrict out, const int width, const int height, const float sigma)
233+
{
234+
float blurmat[13];
235+
dt_masks_blur_9x9_coeff(blurmat, sigma);
236+
152237
const int w1 = width;
153238
const int w2 = 2*width;
154239
const int w3 = 3*width;
155240
const int w4 = 4*width;
156241
#ifdef _OPENMP
157242
#pragma omp parallel for default(none) \
158243
dt_omp_firstprivate(src, out) \
159-
dt_omp_sharedconst(c42, c41, c40, c33, c32, c31, c30, c22, c21, c20, c11, c10, c00, w1, w2, w3, w4, width, height) \
244+
dt_omp_sharedconst(blurmat, width, height, w1, w2, w3, w4) \
160245
schedule(simd:static)
161246
#endif
162247
for(int row = 4; row < height - 4; row++)
@@ -169,25 +254,81 @@ void dt_masks_blur_9x9(float *const restrict src, float *const restrict out, con
169254
for(int col = 4; col < width - 4; col++)
170255
{
171256
const int i = row * width + col;
172-
const float val = c42 * (src[i - w4 - 2] + src[i - w4 + 2] + src[i - w2 - 4] + src[i - w2 + 4] + src[i + w2 - 4] + src[i + w2 + 4] + src[i + w4 - 2] + src[i + w4 + 2]) +
173-
c41 * (src[i - w4 - 1] + src[i - w4 + 1] + src[i - w1 - 4] + src[i - w1 + 4] + src[i + w1 - 4] + src[i + w1 + 4] + src[i + w4 - 1] + src[i + w4 + 1]) +
174-
c40 * (src[i - w4] + src[i - 4] + src[i + 4] + src[i + w4]) +
175-
c33 * (src[i - w3 - 3] + src[i - w3 + 3] + src[i + w3 - 3] + src[i + w3 + 3]) +
176-
c32 * (src[i - w3 - 2] + src[i - w3 + 2] + src[i - w2 - 3] + src[i - w2 + 3] + src[i + w2 - 3] + src[i + w2 + 3] + src[i + w3 - 2] + src[i + w3 + 2]) +
177-
c31 * (src[i - w3 - 1] + src[i - w3 + 1] + src[i - w1 - 3] + src[i - w1 + 3] + src[i + w1 - 3] + src[i + w1 + 3] + src[i + w3 - 1] + src[i + w3 + 1]) +
178-
c30 * (src[i - w3] + src[i - 3] + src[i + 3] + src[i + w3]) +
179-
c22 * (src[i - w2 - 2] + src[i - w2 + 2] + src[i + w2 - 2] + src[i + w2 + 2]) +
180-
c21 * (src[i - w2 - 1] + src[i - w2 + 1] + src[i - w1 - 2] + src[i - w1 + 2] + src[i + w1 - 2] + src[i + w1 + 2] + src[i + w2 - 1] + src[i + w2 + 1]) +
181-
c20 * (src[i - w2] + src[i - 2] + src[i + 2] + src[i + w2]) +
182-
c11 * (src[i - w1 - 1] + src[i - w1 + 1] + src[i + w1 - 1] + src[i + w1 + 1]) +
183-
c10 * (src[i - w1] + src[i - 1] + src[i + 1] + src[i + w1]) +
184-
c00 * src[i];
185-
out[i] = fminf(1.0f, fmaxf(0.0f, val));
257+
out[i] = fminf(1.0f, fmaxf(0.0f, FAST_BLUR_9));
186258
}
187259
}
188260
dt_masks_extend_border(out, width, height, 4);
189261
}
190262

263+
void _masks_blur_13x13_coeff(float *c, const float sigma)
264+
{
265+
float kernel[13][13];
266+
const float temp = -2.0f * sqrf(sigma);
267+
const float range = sqrf(3.0f * 2.0f);
268+
float sum = 0.0f;
269+
for(int k = -6; k <= 6; k++)
270+
{
271+
for(int j = -6; j <= 6; j++)
272+
{
273+
if((sqrf(k) + sqrf(j)) <= range)
274+
{
275+
kernel[k + 6][j + 6] = expf((sqrf(k) + sqrf(j)) / temp);
276+
sum += kernel[k + 6][j + 6];
277+
}
278+
else
279+
kernel[k + 6][j + 6] = 0.0f;
280+
}
281+
}
282+
for(int i = 0; i < 13; i++)
283+
{
284+
#if defined(__GNUC__)
285+
#pragma GCC ivdep
286+
#endif
287+
for(int j = 0; j < 13; j++)
288+
kernel[i][j] /= sum;
289+
}
290+
/* c60 */ c[0] = kernel[0][6];
291+
/* c53 */ c[1] = kernel[1][3];
292+
/* c52 */ c[2] = kernel[1][4];
293+
/* c51 */ c[3] = kernel[1][5];
294+
/* c50 */ c[4] = kernel[1][6];
295+
/* c44 */ c[5] = kernel[2][2];
296+
/* c42 */ c[6] = kernel[2][4];
297+
/* c41 */ c[7] = kernel[2][5];
298+
/* c40 */ c[8] = kernel[2][6];
299+
/* c33 */ c[9] = kernel[3][3];
300+
/* c32 */ c[10] = kernel[3][4];
301+
/* c31 */ c[11] = kernel[3][5];
302+
/* c30 */ c[12] = kernel[3][6];
303+
/* c22 */ c[13] = kernel[4][4];
304+
/* c21 */ c[14] = kernel[4][5];
305+
/* c20 */ c[15] = kernel[4][6];
306+
/* c11 */ c[16] = kernel[5][5];
307+
/* c10 */ c[17] = kernel[5][6];
308+
/* c00 */ c[18] = kernel[6][6];
309+
}
310+
311+
/* Weighted sum of the 113 taps of a circularly truncated 13x13 window
 * around src[i].
 *
 * The expression picks up these names from the expansion site:
 *   src       pixel buffer being read
 *   i         index of the centre pixel
 *   w1 .. w6  offsets of one to six rows
 *   blurmat   nineteen weights in the order c60, c53, c52, c51, c50, c44,
 *             c42, c41, c40, c33, c32, c31, c30, c22, c21, c20, c11, c10,
 *             c00 (the order written by _masks_blur_13x13_coeff)
 * blurmat[4] weights both the (5,0) taps and the (4,3)/(3,4) taps.
 */
#define FAST_BLUR_13 \
  ( blurmat[0] * (src[i - w6] + src[i - 6] + src[i + 6] + src[i + w6]) \
  + blurmat[1] * (  (src[i - w5 - 3] + src[i - w5 + 3]) \
                  + (src[i - w3 - 5] + src[i - w3 + 5]) \
                  + (src[i + w3 - 5] + src[i + w3 + 5]) \
                  + (src[i + w5 - 3] + src[i + w5 + 3])) \
  + blurmat[2] * (  (src[i - w5 - 2] + src[i - w5 + 2]) \
                  + (src[i - w2 - 5] + src[i - w2 + 5]) \
                  + (src[i + w2 - 5] + src[i + w2 + 5]) \
                  + (src[i + w5 - 2] + src[i + w5 + 2])) \
  + blurmat[3] * (  (src[i - w5 - 1] + src[i - w5 + 1]) \
                  + (src[i - w1 - 5] + src[i - w1 + 5]) \
                  + (src[i + w1 - 5] + src[i + w1 + 5]) \
                  + (src[i + w5 - 1] + src[i + w5 + 1])) \
  + blurmat[4] * (  (src[i - w5] + src[i - 5] + src[i + 5] + src[i + w5]) \
                  + (  (src[i - w4 - 3] + src[i - w4 + 3]) \
                     + (src[i - w3 - 4] + src[i - w3 + 4]) \
                     + (src[i + w3 - 4] + src[i + w3 + 4]) \
                     + (src[i + w4 - 3] + src[i + w4 + 3]))) \
  + blurmat[5] * (src[i - w4 - 4] + src[i - w4 + 4] + src[i + w4 - 4] + src[i + w4 + 4]) \
  + blurmat[6] * (  (src[i - w4 - 2] + src[i - w4 + 2]) \
                  + (src[i - w2 - 4] + src[i - w2 + 4]) \
                  + (src[i + w2 - 4] + src[i + w2 + 4]) \
                  + (src[i + w4 - 2] + src[i + w4 + 2])) \
  + blurmat[7] * (  (src[i - w4 - 1] + src[i - w4 + 1]) \
                  + (src[i - w1 - 4] + src[i - w1 + 4]) \
                  + (src[i + w1 - 4] + src[i + w1 + 4]) \
                  + (src[i + w4 - 1] + src[i + w4 + 1])) \
  + blurmat[8] * (src[i - w4] + src[i - 4] + src[i + 4] + src[i + w4]) \
  + blurmat[9] * (src[i - w3 - 3] + src[i - w3 + 3] + src[i + w3 - 3] + src[i + w3 + 3]) \
  + blurmat[10] * (  (src[i - w3 - 2] + src[i - w3 + 2]) \
                   + (src[i - w2 - 3] + src[i - w2 + 3]) \
                   + (src[i + w2 - 3] + src[i + w2 + 3]) \
                   + (src[i + w3 - 2] + src[i + w3 + 2])) \
  + blurmat[11] * (  (src[i - w3 - 1] + src[i - w3 + 1]) \
                   + (src[i - w1 - 3] + src[i - w1 + 3]) \
                   + (src[i + w1 - 3] + src[i + w1 + 3]) \
                   + (src[i + w3 - 1] + src[i + w3 + 1])) \
  + blurmat[12] * (src[i - w3] + src[i - 3] + src[i + 3] + src[i + w3]) \
  + blurmat[13] * (src[i - w2 - 2] + src[i - w2 + 2] + src[i + w2 - 2] + src[i + w2 + 2]) \
  + blurmat[14] * (  (src[i - w2 - 1] + src[i - w2 + 1]) \
                   + (src[i - w1 - 2] + src[i - w1 + 2]) \
                   + (src[i + w1 - 2] + src[i + w1 + 2]) \
                   + (src[i + w2 - 1] + src[i + w2 + 1])) \
  + blurmat[15] * (src[i - w2] + src[i - 2] + src[i + 2] + src[i + w2]) \
  + blurmat[16] * (src[i - w1 - 1] + src[i - w1 + 1] + src[i + w1 - 1] + src[i + w1 + 1]) \
  + blurmat[17] * (src[i - w1] + src[i - 1] + src[i + 1] + src[i + w1]) \
  + blurmat[18] * src[i] )
331+
191332
void dt_masks_calc_rawdetail_mask(float *const restrict src, float *const restrict mask, float *const restrict tmp,
192333
const int width, const int height, const dt_aligned_pixel_t wb)
193334
{
@@ -244,7 +385,7 @@ void dt_masks_calc_detail_mask(float *const restrict src, float *const restrict
244385
#ifdef _OPENMP
245386
#pragma omp parallel for simd default(none) \
246387
dt_omp_firstprivate(src, tmp, msize, threshold, detail) \
247-
schedule(simd:static) aligned(src, tmp : 64)
388+
schedule(simd:static) aligned(src, tmp, out : 64)
248389
#endif
249390
for(int idx = 0; idx < msize; idx++)
250391
{
@@ -253,3 +394,7 @@ void dt_masks_calc_detail_mask(float *const restrict src, float *const restrict
253394
}
254395
dt_masks_blur_9x9(tmp, out, width, height, 2.0f);
255396
}
397+
#undef FAST_BLUR_5
398+
#undef FAST_BLUR_9
399+
#undef FAST_BLUR_13
400+

src/iop/dual_demosaic.c

Lines changed: 2 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -154,31 +154,8 @@ gboolean dual_demosaic_cl(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *
154154
}
155155

156156
{
157-
// For a blurring sigma of 2.0f a 13x13 kernel would be optimally required but the 9x9 is by far good enough here
158-
float kernel[9][9];
159-
const float temp = -2.0f * (2.0f * 2.0f);
160-
float sum = 0.0f;
161-
for(int i = -4; i <= 4; i++)
162-
{
163-
for(int j = -4; j <= 4; j++)
164-
{
165-
kernel[i + 4][j + 4] = expf(((i*i) + (j*j)) / temp);
166-
sum += kernel[i + 4][j + 4];
167-
}
168-
}
169-
for(int i = 0; i < 9; i++)
170-
{
171-
#if defined(__GNUC__)
172-
#pragma GCC ivdep
173-
#endif
174-
for(int j = 0; j < 9; j++)
175-
kernel[i][j] /= sum;
176-
}
177-
178-
float blurmat[13] = { kernel[4][4], kernel[3][4], kernel[3][3], // 00: c00 c10 c11
179-
kernel[2][4], kernel[2][3], kernel[2][2], // 03: c20 c21 c22
180-
kernel[1][4], kernel[1][3], kernel[1][2], kernel[1][1], // 06: c30 c31 c32 c33
181-
kernel[0][4], kernel[0][3], kernel[0][2]}; // 10: c40 c41 c42
157+
float blurmat[13];
158+
dt_masks_blur_9x9_coeff(blurmat, 2.0f);
182159
cl_mem dev_blurmat = NULL;
183160
dev_blurmat = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * 13, blurmat);
184161
if(dev_blurmat != NULL)

0 commit comments

Comments
 (0)