@@ -90,9 +90,18 @@ static inline float sqrf(float a)
90
90
return a * a ;
91
91
}
92
92
93
+ #ifdef _OPENMP
94
+ #pragma omp declare simd aligned(mask : 64)
95
+ #endif
93
96
void dt_masks_extend_border (float * mask , const int width , const int height , const int border )
94
97
{
95
98
if (border <= 0 ) return ;
99
+ #ifdef _OPENMP
100
+ #pragma omp parallel for default(none) \
101
+ dt_omp_firstprivate(mask) \
102
+ dt_omp_sharedconst(width, height, border) \
103
+ schedule(simd:static)
104
+ #endif
96
105
for (int row = border ; row < height - border ; row ++ )
97
106
{
98
107
const int idx = row * width ;
@@ -102,6 +111,12 @@ void dt_masks_extend_border(float *mask, const int width, const int height, cons
102
111
mask [idx + width - i - 1 ] = mask [idx + width - border - 1 ];
103
112
}
104
113
}
114
+ #ifdef _OPENMP
115
+ #pragma omp parallel for default(none) \
116
+ dt_omp_firstprivate(mask) \
117
+ dt_omp_sharedconst(width, height, border) \
118
+ schedule(simd:static)
119
+ #endif
105
120
for (int col = 0 ; col < width ; col ++ )
106
121
{
107
122
const float top = mask [border * width + MIN (width - border - 1 , MAX (col , border ))];
@@ -114,49 +129,119 @@ void dt_masks_extend_border(float *mask, const int width, const int height, cons
114
129
}
115
130
}
116
131
117
- #ifdef _OPENMP
118
- #pragma omp declare simd aligned(src, out : 64)
132
+ void _masks_blur_5x5_coeff (float * c , const float sigma )
133
+ {
134
+ float kernel [5 ][5 ];
135
+ const float temp = -2.0f * sqrf (sigma );
136
+ const float range = sqrf (3.0f * 0.84f );
137
+ float sum = 0.0f ;
138
+ for (int k = -2 ; k <= 2 ; k ++ )
139
+ {
140
+ for (int j = -2 ; j <= 2 ; j ++ )
141
+ {
142
+ if ((sqrf (k ) + sqrf (j )) <= range )
143
+ {
144
+ kernel [k + 2 ][j + 2 ] = expf ((sqrf (k ) + sqrf (j )) / temp );
145
+ sum += kernel [k + 2 ][j + 2 ];
146
+ }
147
+ else
148
+ kernel [k + 2 ][j + 2 ] = 0.0f ;
149
+ }
150
+ }
151
+ for (int i = 0 ; i < 5 ; i ++ )
152
+ {
153
+ #if defined(__GNUC__ )
154
+ #pragma GCC ivdep
119
155
#endif
120
- void dt_masks_blur_9x9 (float * const restrict src , float * const restrict out , const int width , const int height , const float sigma )
156
+ for (int j = 0 ; j < 5 ; j ++ )
157
+ kernel [i ][j ] /= sum ;
158
+ }
159
+ /* c21 */ c [0 ] = kernel [0 ][1 ];
160
+ /* c20 */ c [1 ] = kernel [0 ][2 ];
161
+ /* c11 */ c [2 ] = kernel [1 ][1 ];
162
+ /* c10 */ c [3 ] = kernel [1 ][2 ];
163
+ /* c00 */ c [4 ] = kernel [2 ][2 ];
164
+ }
165
/* FAST_BLUR_5: expression macro evaluating a symmetric 5x5 weighted sum around src[i].
 * It takes no arguments and relies on these names being in scope where it is expanded:
 *   blurmat - 5 coefficients in the order written by _masks_blur_5x5_coeff()
 *             ([0]=c21, [1]=c20, [2]=c11, [3]=c10, [4]=c00)
 *   src     - the buffer being sampled
 *   i       - flat index of the centre sample
 *   w1, w2  - index offsets of one and two rows (presumably width and 2 * width,
 *             as set up in dt_masks_blur_9x9 - confirm at the expansion site)
 * The four (+-2, +-2) corner taps are never read. All other samples up to two rows
 * and two columns away from i are read, so i must lie at least that far inside src.
 */
+ #define FAST_BLUR_5 ( \
166
+ blurmat[0] * ((src[i - w2 - 1] + src[i - w2 + 1]) + (src[i - w1 - 2] + src[i - w1 + 2]) + (src[i + w1 - 2] + src[i + w1 + 2]) + (src[i + w2 - 1] + src[i + w2 + 1])) + \
167
+ blurmat[1] * (src[i - w2] + src[i - 2] + src[i + 2] + src[i + w2]) + \
168
+ blurmat[2] * (src[i - w1 - 1] + src[i - w1 + 1] + src[i + w1 - 1] + src[i + w1 + 1]) + \
169
+ blurmat[3] * (src[i - w1] + src[i - 1] + src[i + 1] + src[i + w1]) + \
170
+ blurmat[4] * src[i] )
171
+
172
+ void dt_masks_blur_9x9_coeff (float * c , const float sigma )
121
173
{
122
- // For a blurring sigma of 2.0f a 13x13 kernel would be optimally required but the 9x9 is by far good enough here
123
174
float kernel [9 ][9 ];
124
- const float temp = 2.0f * sqrf (sigma );
175
+ const float temp = -2.0f * sqrf (sigma );
176
+ const float range = sqrf (3.0f * 1.5f );
125
177
float sum = 0.0f ;
126
- for (int i = -4 ; i <= 4 ; i ++ )
178
+ for (int k = -4 ; k <= 4 ; k ++ )
127
179
{
128
180
for (int j = -4 ; j <= 4 ; j ++ )
129
181
{
130
- kernel [i + 4 ][j + 4 ] = expf ( - (sqrf (i ) + sqrf (j )) / temp );
131
- sum += kernel [i + 4 ][j + 4 ];
182
+ if ((sqrf (k ) + sqrf (j )) <= range )
183
+ {
184
+ kernel [k + 4 ][j + 4 ] = expf ((sqrf (k ) + sqrf (j )) / temp );
185
+ sum += kernel [k + 4 ][j + 4 ];
186
+ }
187
+ else
188
+ kernel [k + 4 ][j + 4 ] = 0.0f ;
132
189
}
133
190
}
134
191
for (int i = 0 ; i < 9 ; i ++ )
135
192
{
193
+ #if defined(__GNUC__ )
194
+ #pragma GCC ivdep
195
+ #endif
136
196
for (int j = 0 ; j < 9 ; j ++ )
137
197
kernel [i ][j ] /= sum ;
138
198
}
139
- const float c42 = kernel [0 ][2 ];
140
- const float c41 = kernel [0 ][3 ];
141
- const float c40 = kernel [0 ][4 ];
142
- const float c33 = kernel [1 ][1 ];
143
- const float c32 = kernel [1 ][2 ];
144
- const float c31 = kernel [1 ][3 ];
145
- const float c30 = kernel [1 ][4 ];
146
- const float c22 = kernel [2 ][2 ];
147
- const float c21 = kernel [2 ][3 ];
148
- const float c20 = kernel [2 ][4 ];
149
- const float c11 = kernel [3 ][3 ];
150
- const float c10 = kernel [3 ][4 ];
151
- const float c00 = kernel [4 ][4 ];
199
+ /* c00 */ c [0 ] = kernel [4 ][4 ];
200
+ /* c10 */ c [1 ] = kernel [3 ][4 ];
201
+ /* c11 */ c [2 ] = kernel [3 ][3 ];
202
+ /* c20 */ c [3 ] = kernel [2 ][4 ];
203
+ /* c21 */ c [4 ] = kernel [2 ][3 ];
204
+ /* c22 */ c [5 ] = kernel [2 ][2 ];
205
+ /* c30 */ c [6 ] = kernel [1 ][4 ];
206
+ /* c31 */ c [7 ] = kernel [1 ][3 ];
207
+ /* c32 */ c [8 ] = kernel [1 ][2 ];
208
+ /* c33 */ c [9 ] = kernel [1 ][1 ];
209
+ /* c40 */ c [10 ] = kernel [0 ][4 ];
210
+ /* c41 */ c [11 ] = kernel [0 ][3 ];
211
+ /* c42 */ c [12 ] = kernel [0 ][2 ];
212
+ }
213
+
214
/* FAST_BLUR_9: expression macro evaluating a symmetric 9x9 weighted sum around src[i].
 * It takes no arguments and relies on these names being in scope where it is expanded:
 *   blurmat    - 13 coefficients in the order written by dt_masks_blur_9x9_coeff()
 *                ([0]=c00 ... [12]=c42)
 *   src        - the buffer being sampled
 *   i          - flat index of the centre sample
 *   w1 .. w4   - index offsets of one to four rows (dt_masks_blur_9x9 defines them
 *                as 1..4 times width)
 * Taps at (+-4, +-3), (+-3, +-4) and (+-4, +-4) are never read. Samples up to four
 * rows and four columns away from i are read, which is why dt_masks_blur_9x9 only
 * expands this for rows/columns 4 .. size - 5.
 */
+ #define FAST_BLUR_9 ( \
215
+ blurmat[12] * (src[i - w4 - 2] + src[i - w4 + 2] + src[i - w2 - 4] + src[i - w2 + 4] + src[i + w2 - 4] + src[i + w2 + 4] + src[i + w4 - 2] + src[i + w4 + 2]) + \
216
+ blurmat[11] * (src[i - w4 - 1] + src[i - w4 + 1] + src[i - w1 - 4] + src[i - w1 + 4] + src[i + w1 - 4] + src[i + w1 + 4] + src[i + w4 - 1] + src[i + w4 + 1]) + \
217
+ blurmat[10] * (src[i - w4] + src[i - 4] + src[i + 4] + src[i + w4]) + \
218
+ blurmat[9] * (src[i - w3 - 3] + src[i - w3 + 3] + src[i + w3 - 3] + src[i + w3 + 3]) + \
219
+ blurmat[8] * (src[i - w3 - 2] + src[i - w3 + 2] + src[i - w2 - 3] + src[i - w2 + 3] + src[i + w2 - 3] + src[i + w2 + 3] + src[i + w3 - 2] + src[i + w3 + 2]) + \
220
+ blurmat[7] * (src[i - w3 - 1] + src[i - w3 + 1] + src[i - w1 - 3] + src[i - w1 + 3] + src[i + w1 - 3] + src[i + w1 + 3] + src[i + w3 - 1] + src[i + w3 + 1]) + \
221
+ blurmat[6] * (src[i - w3] + src[i - 3] + src[i + 3] + src[i + w3]) + \
222
+ blurmat[5] * (src[i - w2 - 2] + src[i - w2 + 2] + src[i + w2 - 2] + src[i + w2 + 2]) + \
223
+ blurmat[4] * (src[i - w2 - 1] + src[i - w2 + 1] + src[i - w1 - 2] + src[i - w1 + 2] + src[i + w1 - 2] + src[i + w1 + 2] + src[i + w2 - 1] + src[i + w2 + 1]) + \
224
+ blurmat[3] * (src[i - w2] + src[i - 2] + src[i + 2] + src[i + w2]) + \
225
+ blurmat[2] * (src[i - w1 - 1] + src[i - w1 + 1] + src[i + w1 - 1] + src[i + w1 + 1]) + \
226
+ blurmat[1] * (src[i - w1] + src[i - 1] + src[i + 1] + src[i + w1]) + \
227
+ blurmat[0] * src[i] )
228
+
229
+ #ifdef _OPENMP
230
+ #pragma omp declare simd aligned(src, out : 64)
231
+ #endif
232
+ void dt_masks_blur_9x9 (float * const restrict src , float * const restrict out , const int width , const int height , const float sigma )
233
+ {
234
+ float blurmat [13 ];
235
+ dt_masks_blur_9x9_coeff (blurmat , sigma );
236
+
152
237
const int w1 = width ;
153
238
const int w2 = 2 * width ;
154
239
const int w3 = 3 * width ;
155
240
const int w4 = 4 * width ;
156
241
#ifdef _OPENMP
157
242
#pragma omp parallel for default(none) \
158
243
dt_omp_firstprivate(src, out) \
159
- dt_omp_sharedconst(c42, c41, c40, c33, c32, c31, c30, c22, c21, c20, c11, c10, c00, w1, w2, w3, w4, width, height ) \
244
+ dt_omp_sharedconst(blurmat, width, height, w1, w2, w3, w4) \
160
245
schedule(simd:static)
161
246
#endif
162
247
for (int row = 4 ; row < height - 4 ; row ++ )
@@ -169,25 +254,81 @@ void dt_masks_blur_9x9(float *const restrict src, float *const restrict out, con
169
254
for (int col = 4 ; col < width - 4 ; col ++ )
170
255
{
171
256
const int i = row * width + col ;
172
- const float val = c42 * (src [i - w4 - 2 ] + src [i - w4 + 2 ] + src [i - w2 - 4 ] + src [i - w2 + 4 ] + src [i + w2 - 4 ] + src [i + w2 + 4 ] + src [i + w4 - 2 ] + src [i + w4 + 2 ]) +
173
- c41 * (src [i - w4 - 1 ] + src [i - w4 + 1 ] + src [i - w1 - 4 ] + src [i - w1 + 4 ] + src [i + w1 - 4 ] + src [i + w1 + 4 ] + src [i + w4 - 1 ] + src [i + w4 + 1 ]) +
174
- c40 * (src [i - w4 ] + src [i - 4 ] + src [i + 4 ] + src [i + w4 ]) +
175
- c33 * (src [i - w3 - 3 ] + src [i - w3 + 3 ] + src [i + w3 - 3 ] + src [i + w3 + 3 ]) +
176
- c32 * (src [i - w3 - 2 ] + src [i - w3 + 2 ] + src [i - w2 - 3 ] + src [i - w2 + 3 ] + src [i + w2 - 3 ] + src [i + w2 + 3 ] + src [i + w3 - 2 ] + src [i + w3 + 2 ]) +
177
- c31 * (src [i - w3 - 1 ] + src [i - w3 + 1 ] + src [i - w1 - 3 ] + src [i - w1 + 3 ] + src [i + w1 - 3 ] + src [i + w1 + 3 ] + src [i + w3 - 1 ] + src [i + w3 + 1 ]) +
178
- c30 * (src [i - w3 ] + src [i - 3 ] + src [i + 3 ] + src [i + w3 ]) +
179
- c22 * (src [i - w2 - 2 ] + src [i - w2 + 2 ] + src [i + w2 - 2 ] + src [i + w2 + 2 ]) +
180
- c21 * (src [i - w2 - 1 ] + src [i - w2 + 1 ] + src [i - w1 - 2 ] + src [i - w1 + 2 ] + src [i + w1 - 2 ] + src [i + w1 + 2 ] + src [i + w2 - 1 ] + src [i + w2 + 1 ]) +
181
- c20 * (src [i - w2 ] + src [i - 2 ] + src [i + 2 ] + src [i + w2 ]) +
182
- c11 * (src [i - w1 - 1 ] + src [i - w1 + 1 ] + src [i + w1 - 1 ] + src [i + w1 + 1 ]) +
183
- c10 * (src [i - w1 ] + src [i - 1 ] + src [i + 1 ] + src [i + w1 ]) +
184
- c00 * src [i ];
185
- out [i ] = fminf (1.0f , fmaxf (0.0f , val ));
257
+ out [i ] = fminf (1.0f , fmaxf (0.0f , FAST_BLUR_9 ));
186
258
}
187
259
}
188
260
dt_masks_extend_border (out , width , height , 4 );
189
261
}
190
262
263
+ void _masks_blur_13x13_coeff (float * c , const float sigma )
264
+ {
265
+ float kernel [13 ][13 ];
266
+ const float temp = -2.0f * sqrf (sigma );
267
+ const float range = sqrf (3.0f * 2.0f );
268
+ float sum = 0.0f ;
269
+ for (int k = -6 ; k <= 6 ; k ++ )
270
+ {
271
+ for (int j = -6 ; j <= 6 ; j ++ )
272
+ {
273
+ if ((sqrf (k ) + sqrf (j )) <= range )
274
+ {
275
+ kernel [k + 6 ][j + 6 ] = expf ((sqrf (k ) + sqrf (j )) / temp );
276
+ sum += kernel [k + 6 ][j + 6 ];
277
+ }
278
+ else
279
+ kernel [k + 6 ][j + 6 ] = 0.0f ;
280
+ }
281
+ }
282
+ for (int i = 0 ; i < 13 ; i ++ )
283
+ {
284
+ #if defined(__GNUC__ )
285
+ #pragma GCC ivdep
286
+ #endif
287
+ for (int j = 0 ; j < 13 ; j ++ )
288
+ kernel [i ][j ] /= sum ;
289
+ }
290
+ /* c60 */ c [0 ] = kernel [0 ][6 ];
291
+ /* c53 */ c [1 ] = kernel [1 ][3 ];
292
+ /* c52 */ c [2 ] = kernel [1 ][4 ];
293
+ /* c51 */ c [3 ] = kernel [1 ][5 ];
294
+ /* c50 */ c [4 ] = kernel [1 ][6 ];
295
+ /* c44 */ c [5 ] = kernel [2 ][2 ];
296
+ /* c42 */ c [6 ] = kernel [2 ][4 ];
297
+ /* c41 */ c [7 ] = kernel [2 ][5 ];
298
+ /* c40 */ c [8 ] = kernel [2 ][6 ];
299
+ /* c33 */ c [9 ] = kernel [3 ][3 ];
300
+ /* c32 */ c [10 ] = kernel [3 ][4 ];
301
+ /* c31 */ c [11 ] = kernel [3 ][5 ];
302
+ /* c30 */ c [12 ] = kernel [3 ][6 ];
303
+ /* c22 */ c [13 ] = kernel [4 ][4 ];
304
+ /* c21 */ c [14 ] = kernel [4 ][5 ];
305
+ /* c20 */ c [15 ] = kernel [4 ][6 ];
306
+ /* c11 */ c [16 ] = kernel [5 ][5 ];
307
+ /* c10 */ c [17 ] = kernel [5 ][6 ];
308
+ /* c00 */ c [18 ] = kernel [6 ][6 ];
309
+ }
310
+
311
/* FAST_BLUR_13: expression macro evaluating a symmetric 13x13 weighted sum around src[i].
 * It takes no arguments and relies on these names being in scope where it is expanded:
 *   blurmat    - 19 coefficients in the order written by _masks_blur_13x13_coeff()
 *                ([0]=c60 ... [18]=c00)
 *   src        - the buffer being sampled
 *   i          - flat index of the centre sample
 *   w1 .. w6   - index offsets of one to six rows (presumably 1..6 times the row
 *                width, by analogy with dt_masks_blur_9x9 - confirm at the expansion site)
 * blurmat[4] intentionally multiplies two groups of taps: the (5,0) class and the
 * (4,3) class lie at the same squared distance (25) from the centre and therefore
 * share one gaussian weight. Samples up to six rows and six columns away from i are
 * read, so i must lie at least that far inside src.
 */
+ #define FAST_BLUR_13 ( \
312
+ blurmat[0] * (src[i - w6] + src[i - 6] + src[i + 6] + src[i + w6]) + \
313
+ blurmat[1] * ((src[i - w5 - 3] + src[i - w5 + 3]) + (src[i - w3 - 5] + src[i - w3 + 5]) + (src[i + w3 - 5] + src[i + w3 + 5]) + (src[i + w5 - 3] + src[i + w5 + 3])) + \
314
+ blurmat[2] * ((src[i - w5 - 2] + src[i - w5 + 2]) + (src[i - w2 - 5] + src[i - w2 + 5]) + (src[i + w2 - 5] + src[i + w2 + 5]) + (src[i + w5 - 2] + src[i + w5 + 2])) + \
315
+ blurmat[3] * ((src[i - w5 - 1] + src[i - w5 + 1]) + (src[i - w1 - 5] + src[i - w1 + 5]) + (src[i + w1 - 5] + src[i + w1 + 5]) + (src[i + w5 - 1] + src[i + w5 + 1])) + \
316
+ blurmat[4] * ((src[i - w5] + src[i - 5] + src[i + 5] + src[i + w5]) + ((src[i - w4 - 3] + src[i - w4 + 3]) + (src[i - w3 - 4] + src[i - w3 + 4]) + (src[i + w3 - 4] + src[i + w3 + 4]) + (src[i + w4 - 3] + src[i + w4 + 3]))) + \
317
+ blurmat[5] * (src[i - w4 - 4] + src[i - w4 + 4] + src[i + w4 - 4] + src[i + w4 + 4]) + \
318
+ blurmat[6] * ((src[i - w4 - 2] + src[i - w4 + 2]) + (src[i - w2 - 4] + src[i - w2 + 4]) + (src[i + w2 - 4] + src[i + w2 + 4]) + (src[i + w4 - 2] + src[i + w4 + 2])) + \
319
+ blurmat[7] * ((src[i - w4 - 1] + src[i - w4 + 1]) + (src[i - w1 - 4] + src[i - w1 + 4]) + (src[i + w1 - 4] + src[i + w1 + 4]) + (src[i + w4 - 1] + src[i + w4 + 1])) + \
320
+ blurmat[8] * (src[i - w4] + src[i - 4] + src[i + 4] + src[i + w4]) + \
321
+ blurmat[9] * (src[i - w3 - 3] + src[i - w3 + 3] + src[i + w3 - 3] + src[i + w3 + 3]) + \
322
+ blurmat[10] * ((src[i - w3 - 2] + src[i - w3 + 2]) + (src[i - w2 - 3] + src[i - w2 + 3]) + (src[i + w2 - 3] + src[i + w2 + 3]) + (src[i + w3 - 2] + src[i + w3 + 2])) + \
323
+ blurmat[11] * ((src[i - w3 - 1] + src[i - w3 + 1]) + (src[i - w1 - 3] + src[i - w1 + 3]) + (src[i + w1 - 3] + src[i + w1 + 3]) + (src[i + w3 - 1] + src[i + w3 + 1])) + \
324
+ blurmat[12] * (src[i - w3] + src[i - 3] + src[i + 3] + src[i + w3]) + \
325
+ blurmat[13] * (src[i - w2 - 2] + src[i - w2 + 2] + src[i + w2 - 2] + src[i + w2 + 2]) + \
326
+ blurmat[14] * ((src[i - w2 - 1] + src[i - w2 + 1]) + (src[i - w1 - 2] + src[i - w1 + 2]) + (src[i + w1 - 2] + src[i + w1 + 2]) + (src[i + w2 - 1] + src[i + w2 + 1])) + \
327
+ blurmat[15] * (src[i - w2] + src[i - 2] + src[i + 2] + src[i + w2]) + \
328
+ blurmat[16] * (src[i - w1 - 1] + src[i - w1 + 1] + src[i + w1 - 1] + src[i + w1 + 1]) + \
329
+ blurmat[17] * (src[i - w1] + src[i - 1] + src[i + 1] + src[i + w1]) + \
330
+ blurmat[18] * src[i] )
331
+
191
332
void dt_masks_calc_rawdetail_mask (float * const restrict src , float * const restrict mask , float * const restrict tmp ,
192
333
const int width , const int height , const dt_aligned_pixel_t wb )
193
334
{
@@ -244,7 +385,7 @@ void dt_masks_calc_detail_mask(float *const restrict src, float *const restrict
244
385
#ifdef _OPENMP
245
386
#pragma omp parallel for simd default(none) \
246
387
dt_omp_firstprivate(src, tmp, msize, threshold, detail) \
247
- schedule(simd:static) aligned(src, tmp : 64)
388
+ schedule(simd:static) aligned(src, tmp, out : 64)
248
389
#endif
249
390
for (int idx = 0 ; idx < msize ; idx ++ )
250
391
{
@@ -253,3 +394,7 @@ void dt_masks_calc_detail_mask(float *const restrict src, float *const restrict
253
394
}
254
395
dt_masks_blur_9x9 (tmp , out , width , height , 2.0f );
255
396
}
397
/* Drop the three FAST_BLUR_* expression macros here so that no code following
 * this point can expand them. */
+ #undef FAST_BLUR_5
398
+ #undef FAST_BLUR_9
399
+ #undef FAST_BLUR_13
400
+
0 commit comments