Skip to content

Commit acda5c9

Browse files
committed
added RISCV absdiff16s_vec
1 parent 81c6a2e commit acda5c9

5 files changed

+56
-1
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ The following table is a work in progress, "?" means there is not yet an impleme
200200
| thresholdX_ltval_gtval_s (a) | threshold_ltval_gtval_s_C | ippsThreshold_LTValGTVal_32s | threshold_ltval_gtval_s_vec |
201201
| copyXs (a) | copys_C | ippsCopy_32s | copys_vec |
202202
| ? | ? | ? | mulcs_vec |
203-
| absdiff16s_Xs (a) | absdiff16s_c | ? | ? |
203+
| absdiff16s_Xs (a) | absdiff16s_c | ? | absdiff16s_vec |
204204
| sum16s32sX (a) | sum16s32s_C | ippsSum_16s32s_Sfs | sum16s32s_vec |
205205
| ? | ors_c | ippsOr_32u | ? |
206206
| ? | ands_c | ippsAnd_32u | ? |

simd_test.c

+17
Original file line numberDiff line numberDiff line change
@@ -11197,6 +11197,23 @@ for (int i = 0; i < len; i++){
1119711197
printf("absdiff16s_512s %d %lf\n", len, elapsed);
1119811198
l2_err_i16(inout_sref, inout_s3, len);
1119911199
#endif
11200+
11201+
#ifdef RISCV
11202+
clock_gettime(CLOCK_REALTIME, &start);
11203+
absdiff16s_vec(inout_s1, inout_s2, inout_s3, len);
11204+
clock_gettime(CLOCK_REALTIME, &stop);
11205+
elapsed = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3;
11206+
printf("absdiff16s_vec %d %lf\n", len, elapsed);
11207+
11208+
clock_gettime(CLOCK_REALTIME, &start);
11209+
for (l = 0; l < loop; l++)
11210+
absdiff16s_vec(inout_s1, inout_s2, inout_s3, len);
11211+
clock_gettime(CLOCK_REALTIME, &stop);
11212+
elapsed = ((stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3) / (double) loop;
11213+
printf("absdiff16s_vec %d %lf\n", len, elapsed);
11214+
l2_err_i16(inout_sref, inout_s3, len);
11215+
#endif
11216+
1120011217
printf("\n");
1120111218
/////////////////////////////////////////////////////////// POWERSPECT_S16_INTERLEAVED //////////////////////////////////////////////////////////////////////////////
1120211219
printf("POWERSPECT_S16_INTERLEAVED\n");

simd_utils_constants.h

+6
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,13 @@ vfnmsub.vf vd, rs1, vs2, vm
268268
#define VLOAD1_SHORT vmv_v_x_i16m4
269269
#define VSTORE_SHORT vse16_v_i16m4
270270
#define VADD_SHORT vadd_vv_i16m4
271+
// Element-wise subtract of two int16 LMUL=4 vectors (vector-vector form).
#define VSUB_SHORT vsub_vv_i16m4
271272
#define VREDSUMW_SHORT vwredsum_vs_i16m4_i32m1
273+
// Signed "greater than" compare of two int16 LMUL=4 vectors; the _b4 suffix
// means the result is a vbool4_t mask (see V_ELT_BOOLD below).
#define VGT_SHORT_BOOL vmsgt_vv_i16m4_b4
274+
// Mask-driven per-element select between two int16 LMUL=4 vectors.
#define VMERGE_SHORT vmerge_vvm_i16m4
275+
276+
//// BOOL Double
// Mask type produced by VGT_SHORT_BOOL for the int16 LMUL=4 vectors above.
277+
#define V_ELT_BOOLD vbool4_t
272278

273279
//// BOOL
274280
#define V_ELT_BOOL vbool8_t

simd_utils_riscv_int.h

+22
Original file line numberDiff line numberDiff line change
@@ -355,3 +355,25 @@ static inline void sum16s32s_vec(int16_t *src, int len, int32_t *dst, int scale_
355355
vse32_v_i32m1(dst, tmp, 1);
356356
*dst /= scale;
357357
}
358+
359+
static inline void absdiff16s_vec(int16_t *src1, int16_t *src2, int16_t *dst, int len)
360+
{
361+
size_t i;
362+
int16_t *src1_tmp = src1;
363+
int16_t *src2_tmp = src2;
364+
int16_t *dst_tmp = dst;
365+
for (; (i = VSETVL32(len)) > 0; len -= i) {
366+
V_ELT_SHORT va, vb, vc;
367+
va = VLOAD_SHORT(src1_tmp, i);
368+
vb = VLOAD_SHORT(src2_tmp, i);
369+
370+
V_ELT_BOOLD cmp = VGT_SHORT_BOOL(va, vb, i);
371+
V_ELT_SHORT difab = VSUB_SHORT(va, vb, i);
372+
V_ELT_SHORT difba = VSUB_SHORT(vb, va, i);
373+
vc = VMERGE_SHORT(cmp, difba, difab, i);
374+
VSTORE_SHORT(dst_tmp, vc, i);
375+
src1_tmp += i;
376+
src2_tmp += i;
377+
dst_tmp += i;
378+
}
379+
}

simd_utils_sse_int32.h

+10
Original file line numberDiff line numberDiff line change
@@ -326,9 +326,14 @@ static inline __m128i _mm_absdiff_epi32(__m128i a, __m128i b)
326326
cmp = _mm_cmpgt_epi32(a, b);
327327
difab = _mm_sub_epi32(a, b);
328328
difba = _mm_sub_epi32(b, a);
329+
#if 1 // should be faster
330+
return _mm_blendv_epi8(difba, difab, cmp);
331+
#else
329332
difab = _mm_and_si128(cmp, difab);
330333
difba = _mm_andnot_si128(cmp, difba);
331334
return _mm_or_si128(difab, difba);
335+
#endif
336+
332337
#else
333338
return vreinterpretq_m128i_s32(vabdq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
334339
#endif
@@ -341,9 +346,14 @@ static inline __m128i _mm_absdiff_epi8(__m128i a, __m128i b)
341346
cmp = _mm_cmpgt_epi8(a, b);
342347
difab = _mm_sub_epi8(a, b);
343348
difba = _mm_sub_epi8(b, a);
349+
#if 1 // should be faster
350+
return _mm_blendv_epi8(difba, difab, cmp);
351+
#else
344352
difab = _mm_and_si128(cmp, difab);
345353
difba = _mm_andnot_si128(cmp, difba);
346354
return _mm_or_si128(difab, difba);
355+
#endif
356+
347357
#else
348358
return vreinterpretq_m128i_s8(vabdq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
349359
#endif

0 commit comments

Comments
 (0)