Skip to content

Commit 81c6a2e

Browse files
committed
improved absdiffX
1 parent 6913b87 commit 81c6a2e

5 files changed

+47
-10
lines changed

avx_mathfun.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -432,7 +432,7 @@ static inline v8sf cos256_ps(v8sf x)
432432
it is almost as fast, and gives you a free cosine with your sine */
433433
static inline void sincos256_ps(v8sf x, v8sf *s, v8sf *c)
434434
{
435-
v8sf xmm1, xmm2, sign_bit_sin, y;
435+
v8sf xmm1, xmm2, xmm3, sign_bit_sin, y;
436436
v8si imm0, imm2, imm4;
437437

438438
#ifndef __AVX2__

simd_utils_altivec_int32.h

+4
Original file line numberDiff line numberDiff line change
@@ -815,9 +815,13 @@ static inline v8ss vec_absdiff(v8ss a, v8ss b)
815815
cmp = vec_cmpgt(a, b);
816816
difab = vec_sub(a, b);
817817
difba = vec_sub(b, a);
818+
#if 1 // should be faster
819+
return vec_sel(difba, difab, cmp);
820+
#else
818821
difab = vec_and(*(v8ss *) &cmp, difab);
819822
difba = vec_andc(difba, *(v8ss *) &cmp);
820823
return vec_or(difab, difba);
824+
#endif
821825
}
822826

823827
static inline void absdiff16s_128s(int16_t *src1, int16_t *src2, int16_t *dst, int len)

simd_utils_avx512_int32.h

+24-9
Original file line numberDiff line numberDiff line change
@@ -282,44 +282,59 @@ static inline void fast_copy512s_4(int32_t *src, int32_t *dst, int len)
282282
}
283283
}
284284

285-
// to be improved?
286285
// Per-lane absolute difference of two vectors of signed 16-bit integers.
// Both a - b and b - a are computed with non-saturating subtraction, then the
// signed a > b comparison mask picks the non-negative one in each lane.
static inline __m512i _mm512_absdiff_epi16(__m512i a, __m512i b)
287286
{
288287
// NOTE(review): cmp is only used by the disabled #else path below.
__m512i cmp, difab, difba;
289-
__m512i zero = _mm512_setzero_epi32();
290-
__mmask64 cmp_mask = _mm512_cmpgt_epi16_mask(a, b);
291-
cmp = _mm512_mask_set1_epi16(zero, cmp_mask, 0xFFFF);
288+
// One mask bit per 16-bit lane (32 lanes in a 512-bit register).
__mmask32 cmp_mask = _mm512_cmpgt_epi16_mask(a, b);
289+
292290
difab = _mm512_sub_epi16(a, b);
293291
difba = _mm512_sub_epi16(b, a);
292+
#if 1 // should be faster
293+
// Mask bit set (a > b) selects difab, otherwise difba.
return _mm512_mask_blend_epi16(cmp_mask, difba, difab);
294+
#else
295+
// Previous implementation: expand the mask into a full vector and
// combine the two differences with and / andnot / or.
__m512i zero = _mm512_setzero_epi32();
296+
cmp = _mm512_mask_set1_epi16(zero, cmp_mask, 0xFFFF);
294297
difab = _mm512_and_si512(cmp, difab);
295298
difba = _mm512_andnot_si512(cmp, difba);
296299
return _mm512_or_si512(difab, difba);
300+
#endif
301+
297302
}
298303

299304
// Per-lane absolute difference of two vectors of signed 32-bit integers.
// Both a - b and b - a are computed with non-saturating subtraction, then the
// signed a > b comparison mask picks the non-negative one in each lane.
static inline __m512i _mm512_absdiff_epi32(__m512i a, __m512i b)
300305
{
301306
// NOTE(review): cmp is only used by the disabled #else path below.
__m512i cmp, difab, difba;
302-
__m512i zero = _mm512_setzero_epi32();
303-
__mmask64 cmp_mask = _mm512_cmpgt_epi32_mask(a, b);
304-
cmp = _mm512_mask_set1_epi32(zero, cmp_mask, 0xFFFFFFFF);
307+
// One mask bit per 32-bit lane (16 lanes in a 512-bit register).
__mmask16 cmp_mask = _mm512_cmpgt_epi32_mask(a, b);
308+
305309
difab = _mm512_sub_epi32(a, b);
306310
difba = _mm512_sub_epi32(b, a);
311+
#if 1 // should be faster
312+
// Mask bit set (a > b) selects difab, otherwise difba.
return _mm512_mask_blend_epi32(cmp_mask, difba, difab);
313+
#else
314+
// Previous implementation: expand the mask into a full vector and
// combine the two differences with and / andnot / or.
__m512i zero = _mm512_setzero_epi32();
315+
cmp = _mm512_mask_set1_epi32(zero, cmp_mask, 0xFFFFFFFF);
307316
difab = _mm512_and_si512(cmp, difab);
308317
difba = _mm512_andnot_si512(cmp, difba);
309318
return _mm512_or_si512(difab, difba);
319+
#endif
310320
}
311321

312322
// Per-lane absolute difference of two vectors of signed 8-bit integers.
// Both a - b and b - a are computed with non-saturating subtraction, then the
// signed a > b comparison mask picks the non-negative one in each byte lane.
static inline __m512i _mm512_absdiff_epi8(__m512i a, __m512i b)
313323
{
314324
// NOTE(review): cmp is only used by the disabled #else path below.
__m512i cmp, difab, difba;
315-
__m512i zero = _mm512_setzero_epi32();
316325
// One mask bit per 8-bit lane (64 lanes in a 512-bit register).
__mmask64 cmp_mask = _mm512_cmpgt_epi8_mask(a, b);
317-
cmp = _mm512_mask_set1_epi8(zero, cmp_mask, 0xFF);
326+
318327
difab = _mm512_sub_epi8(a, b);
319328
difba = _mm512_sub_epi8(b, a);
329+
#if 1 // should be faster
330+
// The blend must be byte-granular to match the 64-bit per-byte mask.
// The 32-bit blend would truncate the mask to 16 bits and select whole
// dwords, producing wrong results for most byte lanes.
return _mm512_mask_blend_epi8(cmp_mask, difba, difab);
331+
#else
332+
// Previous implementation: expand the mask into a full vector and
// combine the two differences with and / andnot / or.
__m512i zero = _mm512_setzero_epi32();
333+
cmp = _mm512_mask_set1_epi8(zero, cmp_mask, 0xFF);
320334
difab = _mm512_and_si512(cmp, difab);
321335
difba = _mm512_andnot_si512(cmp, difba);
322336
return _mm512_or_si512(difab, difba);
337+
#endif
323338
}
324339

325340
static inline void absdiff16s_512s(int16_t *src1, int16_t *src2, int16_t *dst, int len)

simd_utils_avx_int32.h

+13
Original file line numberDiff line numberDiff line change
@@ -282,9 +282,14 @@ static inline __m256i _mm256_absdiff_epi16(__m256i a, __m256i b)
282282
cmp = _mm256_cmpgt_epi16(a, b);
283283
difab = _mm256_sub_epi16(a, b);
284284
difba = _mm256_sub_epi16(b, a);
285+
#if 1 // should be faster
286+
return _mm256_blendv_epi8(difba, difab, cmp);
287+
#else
285288
difab = _mm256_and_si256(cmp, difab);
286289
difba = _mm256_andnot_si256(cmp, difba);
287290
return _mm256_or_si256(difab, difba);
291+
#endif
292+
288293
}
289294

290295
static inline __m256i _mm256_absdiff_epi32(__m256i a, __m256i b)
@@ -293,9 +298,13 @@ static inline __m256i _mm256_absdiff_epi32(__m256i a, __m256i b)
293298
cmp = _mm256_cmpgt_epi32(a, b);
294299
difab = _mm256_sub_epi32(a, b);
295300
difba = _mm256_sub_epi32(b, a);
301+
#if 1 // should be faster
302+
return _mm256_blendv_epi8(difba, difab, cmp);
303+
#else
296304
difab = _mm256_and_si256(cmp, difab);
297305
difba = _mm256_andnot_si256(cmp, difba);
298306
return _mm256_or_si256(difab, difba);
307+
#endif
299308
}
300309

301310
static inline __m256i _mm256_absdiff_epi8(__m256i a, __m256i b)
@@ -304,9 +313,13 @@ static inline __m256i _mm256_absdiff_epi8(__m256i a, __m256i b)
304313
cmp = _mm256_cmpgt_epi8(a, b);
305314
difab = _mm256_sub_epi8(a, b);
306315
difba = _mm256_sub_epi8(b, a);
316+
#if 1 // should be faster
317+
return _mm256_blendv_epi8(difba, difab, cmp);
318+
#else
307319
difab = _mm256_and_si256(cmp, difab);
308320
difba = _mm256_andnot_si256(cmp, difba);
309321
return _mm256_or_si256(difab, difba);
322+
#endif
310323
}
311324

312325
static inline void absdiff16s_256s(int16_t *src1, int16_t *src2, int16_t *dst, int len)

simd_utils_sse_int32.h

+5
Original file line numberDiff line numberDiff line change
@@ -305,9 +305,14 @@ static inline __m128i _mm_absdiff_epi16(__m128i a, __m128i b)
305305
cmp = _mm_cmpgt_epi16(a, b);
306306
difab = _mm_sub_epi16(a, b);
307307
difba = _mm_sub_epi16(b, a);
308+
#if 1 // should be faster
309+
return _mm_blendv_epi8(difba, difab, cmp);
310+
#else
308311
difab = _mm_and_si128(cmp, difab);
309312
difba = _mm_andnot_si128(cmp, difba);
310313
return _mm_or_si128(difab, difba);
314+
#endif
315+
311316
#else
312317
return vreinterpretq_m128i_s16(vabdq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
313318
#endif

0 commit comments

Comments
 (0)