Skip to content

Commit 296d6c2

Browse files
committed
added multiple Altivec functions
1 parent a8e88e4 commit 296d6c2

6 files changed

+241
-17
lines changed

README.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ The following table is a work in progress, "?" means there is not yet an impleme
108108
| convertFloat32ToU8_X (a) | convertFloat32ToU8_C | ippsConvert_32f8u_Sfs | ? |
109109
| convertFloat32ToU16_X (a) | convertFloat32ToU16_C | ippsConvert_32f16u_Sfs | ? |
110110
| convertFloat32ToI16_X (a) | convertFloat32ToI16_C | ippsConvert_32f16s_Sfs | ? |
111-
| convertInt16ToFloat32_X | convertInt16ToFloat32_C | ippsConvert_16s32f_Sfs | ? |
111+
| convertInt16ToFloat32_X (a) | convertInt16ToFloat32_C | ippsConvert_16s32f_Sfs | ? |
112112
| cplxtorealXf (a) | cplxtorealf_C | ippsCplxToReal_32fc | cplxtorealf_vec |
113113
| realtocplxXf (a) | realtocplx_C | ippsRealToCplx_32f | realtocplxf_vec |
114114
| convertX_64f32f | convert_64f32f_C | ippsConvert_64f32f | convert_64f32f_vec |
@@ -201,7 +201,7 @@ The following table is a work in progress, "?" means there is not yet an impleme
201201
| copyXs (a) | copys_C | ippsCopy_32s | copys_vec |
202202
| ? | ? | ? | mulcs_vec |
203203
| absdiff16s_Xs (a) | absdiff16s_c | ? | ? |
204-
| sum16s32sX | sum16s32s_C | ippsSum_16s32s_Sfs | ? |
204+
| sum16s32sX (a) | sum16s32s_C | ippsSum_16s32s_Sfs | ? |
205205
| ? | ors_c | ippsOr_32u | ? |
206206
| ? | ands_c | ippsAnd_32u | ? |
207207
| sigmoidXf | sigmoidf_C | ? | ? |

simd_test.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -9832,7 +9832,7 @@ for (int i = 0; i < len; i++){
98329832
l2_err(inout_ref, inout2_ref, len);
98339833
#endif
98349834

9835-
#if defined(SSE) // || defined(ALTIVEC)
9835+
#if defined(SSE) || defined(ALTIVEC)
98369836
clock_gettime(CLOCK_REALTIME, &start);
98379837
convertInt16ToFloat32_128(inout_s1, inout_ref, len, 4);
98389838
clock_gettime(CLOCK_REALTIME, &stop);
@@ -10518,7 +10518,7 @@ for (int i = 0; i < len; i++){
1051810518
printf("%d %d\n", inout_iref[0], inout_i1[0]);
1051910519
#endif
1052010520

10521-
#ifdef SSE
10521+
#if defined(SSE) || defined(ALTIVEC)
1052210522
clock_gettime(CLOCK_REALTIME, &start);
1052310523
sum16s32s128(inout_s1, len, &inout_i1[0], 3);
1052410524
clock_gettime(CLOCK_REALTIME, &stop);

simd_utils_altivec_float.h

+143-12
Original file line numberDiff line numberDiff line change
@@ -3910,7 +3910,41 @@ static inline void convertFloat32ToU8_128(float *src, uint8_t *dst, int len, int
39103910
vec_st(tmp7, 0, dst + i);
39113911
}
39123912
} else {
3913-
//TODO
3913+
int unalign_src = (uintptr_t) (src) % ALTIVEC_LEN_BYTES;
3914+
int unalign_dst = (uintptr_t) (dst) % ALTIVEC_LEN_BYTES;
3915+
3916+
for (int i = 0; i < stop_len; i += 4 * ALTIVEC_LEN_FLOAT) {
3917+
v4sf src_tmp1, src_tmp2, src_tmp3, src_tmp4;
3918+
if (unalign_src) {
3919+
src_tmp1 = (v4sf) vec_ldu((unsigned char *) (src + i));
3920+
src_tmp2 = (v4sf) vec_ldu((unsigned char *) (src + i + ALTIVEC_LEN_FLOAT));
3921+
src_tmp3 = (v4sf) vec_ldu((unsigned char *) (src + i + 2 * ALTIVEC_LEN_FLOAT));
3922+
src_tmp4 = (v4sf) vec_ldu((unsigned char *) (src + i + 3 * ALTIVEC_LEN_FLOAT));
3923+
} else {
3924+
src_tmp1 = vec_ld(0, src + i);
3925+
src_tmp2 = vec_ld(0, src + i + ALTIVEC_LEN_FLOAT);
3926+
src_tmp3 = vec_ld(0, src + i + 2 * ALTIVEC_LEN_FLOAT);
3927+
src_tmp4 = vec_ld(0, src + i + 3 * ALTIVEC_LEN_FLOAT);
3928+
}
3929+
3930+
v4sf tmp1 = vec_mul(src_tmp1, scale_fact_vec);
3931+
v4sf tmp2 = vec_mul(src_tmp2, scale_fact_vec);
3932+
v4sf tmp3 = vec_mul(src_tmp3, scale_fact_vec);
3933+
v4sf tmp4 = vec_mul(src_tmp4, scale_fact_vec);
3934+
v4si tmp1_int = vec_cts(tmp1, 0);
3935+
v4si tmp2_int = vec_cts(tmp2, 0);
3936+
v4si tmp3_int = vec_cts(tmp3, 0);
3937+
v4si tmp4_int = vec_cts(tmp4, 0);
3938+
v8ss tmp5 = vec_packs(tmp1_int, tmp2_int);
3939+
v8ss tmp6 = vec_packs(tmp3_int, tmp4_int);
3940+
v16u8 tmp7 = vec_packsu(tmp5, tmp6);
3941+
3942+
if (unalign_dst) {
3943+
vec_stu(*(v16u8 *) &tmp7, (unsigned char *) (dst + i));
3944+
} else {
3945+
vec_st(tmp7, 0, dst + i);
3946+
}
3947+
}
39143948
}
39153949

39163950
if (rounding_mode == RndFinancial) {
@@ -3971,7 +4005,42 @@ static inline void convertFloat32ToI16_128(float *src, int16_t *dst, int len, in
39714005
vec_st(tmp6, 0, dst + i + ALTIVEC_LEN_INT16);
39724006
}
39734007
} else {
3974-
//TODO
4008+
int unalign_src = (uintptr_t) (src) % ALTIVEC_LEN_BYTES;
4009+
int unalign_dst = (uintptr_t) (dst) % ALTIVEC_LEN_BYTES;
4010+
4011+
for (int i = 0; i < stop_len; i += 4 * ALTIVEC_LEN_FLOAT) {
4012+
v4sf src_tmp1, src_tmp2, src_tmp3, src_tmp4;
4013+
if (unalign_src) {
4014+
src_tmp1 = (v4sf) vec_ldu((unsigned char *) (src + i));
4015+
src_tmp2 = (v4sf) vec_ldu((unsigned char *) (src + i + ALTIVEC_LEN_FLOAT));
4016+
src_tmp3 = (v4sf) vec_ldu((unsigned char *) (src + i + 2 * ALTIVEC_LEN_FLOAT));
4017+
src_tmp4 = (v4sf) vec_ldu((unsigned char *) (src + i + 3 * ALTIVEC_LEN_FLOAT));
4018+
} else {
4019+
src_tmp1 = vec_ld(0, src + i);
4020+
src_tmp2 = vec_ld(0, src + i + ALTIVEC_LEN_FLOAT);
4021+
src_tmp3 = vec_ld(0, src + i + 2 * ALTIVEC_LEN_FLOAT);
4022+
src_tmp4 = vec_ld(0, src + i + 3 * ALTIVEC_LEN_FLOAT);
4023+
}
4024+
4025+
v4sf tmp1 = vec_mul(src_tmp1, scale_fact_vec);
4026+
v4sf tmp2 = vec_mul(src_tmp2, scale_fact_vec);
4027+
v4sf tmp3 = vec_mul(src_tmp3, scale_fact_vec);
4028+
v4sf tmp4 = vec_mul(src_tmp4, scale_fact_vec);
4029+
v4si tmp1_int = vec_cts(tmp1, 0);
4030+
v4si tmp2_int = vec_cts(tmp2, 0);
4031+
v4si tmp3_int = vec_cts(tmp3, 0);
4032+
v4si tmp4_int = vec_cts(tmp4, 0);
4033+
v8ss tmp5 = vec_packs(tmp1_int, tmp2_int);
4034+
v8ss tmp6 = vec_packs(tmp3_int, tmp4_int);
4035+
4036+
if (unalign_dst) {
4037+
vec_stu(*(v16u8 *) &tmp5, (unsigned char *) (dst + i));
4038+
vec_stu(*(v16u8 *) &tmp6, (unsigned char *) (dst + i + ALTIVEC_LEN_FLOAT));
4039+
} else {
4040+
vec_st(tmp5, 0, dst + i);
4041+
vec_st(tmp6, 0, dst + i + ALTIVEC_LEN_FLOAT);
4042+
}
4043+
}
39754044
}
39764045

39774046
if (rounding_mode == RndFinancial) {
@@ -4032,7 +4101,42 @@ static inline void convertFloat32ToU16_128(float *src, uint16_t *dst, int len, i
40324101
vec_st(tmp6, 0, dst + i + ALTIVEC_LEN_INT16);
40334102
}
40344103
} else {
4035-
//TODO
4104+
int unalign_src = (uintptr_t) (src) % ALTIVEC_LEN_BYTES;
4105+
int unalign_dst = (uintptr_t) (dst) % ALTIVEC_LEN_BYTES;
4106+
4107+
for (int i = 0; i < stop_len; i += 4 * ALTIVEC_LEN_FLOAT) {
4108+
v4sf src_tmp1, src_tmp2, src_tmp3, src_tmp4;
4109+
if (unalign_src) {
4110+
src_tmp1 = (v4sf) vec_ldu((unsigned char *) (src + i));
4111+
src_tmp2 = (v4sf) vec_ldu((unsigned char *) (src + i + ALTIVEC_LEN_FLOAT));
4112+
src_tmp3 = (v4sf) vec_ldu((unsigned char *) (src + i + 2 * ALTIVEC_LEN_FLOAT));
4113+
src_tmp4 = (v4sf) vec_ldu((unsigned char *) (src + i + 3 * ALTIVEC_LEN_FLOAT));
4114+
} else {
4115+
src_tmp1 = vec_ld(0, src + i);
4116+
src_tmp2 = vec_ld(0, src + i + ALTIVEC_LEN_FLOAT);
4117+
src_tmp3 = vec_ld(0, src + i + 2 * ALTIVEC_LEN_FLOAT);
4118+
src_tmp4 = vec_ld(0, src + i + 3 * ALTIVEC_LEN_FLOAT);
4119+
}
4120+
4121+
v4sf tmp1 = vec_mul(src_tmp1, scale_fact_vec);
4122+
v4sf tmp2 = vec_mul(src_tmp2, scale_fact_vec);
4123+
v4sf tmp3 = vec_mul(src_tmp3, scale_fact_vec);
4124+
v4sf tmp4 = vec_mul(src_tmp4, scale_fact_vec);
4125+
v4si tmp1_int = vec_cts(tmp1, 0);
4126+
v4si tmp2_int = vec_cts(tmp2, 0);
4127+
v4si tmp3_int = vec_cts(tmp3, 0);
4128+
v4si tmp4_int = vec_cts(tmp4, 0);
4129+
v8us tmp5 = vec_packsu(tmp1_int, tmp2_int);
4130+
v8us tmp6 = vec_packsu(tmp3_int, tmp4_int);
4131+
4132+
if (unalign_dst) {
4133+
vec_stu(*(v16u8 *) &tmp5, (unsigned char *) (dst + i));
4134+
vec_stu(*(v16u8 *) &tmp6, (unsigned char *) (dst + i + ALTIVEC_LEN_FLOAT));
4135+
} else {
4136+
vec_st(tmp5, 0, dst + i);
4137+
vec_st(tmp6, 0, dst + i + ALTIVEC_LEN_FLOAT);
4138+
}
4139+
}
40364140
}
40374141

40384142
if (rounding_mode == RndFinancial) {
@@ -4051,25 +4155,24 @@ static inline void convertFloat32ToU16_128(float *src, uint16_t *dst, int len, i
40514155
}
40524156
}
40534157

4054-
/*
40554158
static inline void convertInt16ToFloat32_128(int16_t *src, float *dst, int len, int scale_factor)
40564159
{
40574160
int stop_len = len / (2 * ALTIVEC_LEN_FLOAT);
40584161
stop_len *= (2 * ALTIVEC_LEN_FLOAT);
40594162

40604163
float scale_fact_mult = 1.0f / (float) (1 << scale_factor);
40614164
v4sf scale_fact_vec = vec_splats(scale_fact_mult);
4062-
4165+
v4ui shift = vec_splats((unsigned int)16);
4166+
40634167
if (areAligned2((uintptr_t) (src), (uintptr_t) (dst), ALTIVEC_LEN_BYTES)) {
40644168
for (int i = 0; i < stop_len; i += 2 * ALTIVEC_LEN_FLOAT) {
40654169
v8ss vec = vec_ld(0, src + i); // loads 1 2 3 4 5 6 7 8 8
40664170
v8ss low = vec_mergeh(vec, vec); // low 1 1 2 2 3 3 4 4
40674171
v8ss high = vec_mergel(vec, vec); // high 5 5 6 6 7 7 8 8
4068-
v4ui shift = vec_splats((unsigned int)16);
4069-
v16u8 lowu = vec_sra(*(v16u8*)&low, *(v16u8*)&shift); // make low 1 -1 2 -1 3 -1 4 -4
4070-
v16u8 highu = vec_sra(*(v16u8*)&high, *(v16u8*)&shift); // make high 5 -1 6 -1 7 -1 8 -1
4071-
v4sf lowf = vec_ctf(*(v4si*)&lowu, 0);
4072-
v4sf highf = vec_ctf(*(v4si*)&highu, 0);
4172+
v4si lows = vec_sra(*(v4si*)&low, shift); // make low 1 -1 2 -1 3 -1 4 -4
4173+
v4si highs = vec_sra(*(v4si*)&high, shift); // make high 5 -1 6 -1 7 -1 8 -1
4174+
v4sf lowf = vec_ctf(*(v4si*)&lows, 0);
4175+
v4sf highf = vec_ctf(*(v4si*)&highs, 0);
40734176

40744177
// convert the vector to float and scale it
40754178
v4sf floatlo = vec_mul(lowf, scale_fact_vec);
@@ -4079,12 +4182,40 @@ static inline void convertInt16ToFloat32_128(int16_t *src, float *dst, int len,
40794182
vec_st(floathi, 0, dst + i + ALTIVEC_LEN_FLOAT);
40804183
}
40814184
} else {
4082-
//TODO
4185+
int unalign_src = (uintptr_t) (src) % ALTIVEC_LEN_BYTES;
4186+
int unalign_dst = (uintptr_t) (dst) % ALTIVEC_LEN_BYTES;
4187+
4188+
for (int i = 0; i < stop_len; i += 2 * ALTIVEC_LEN_FLOAT) {
4189+
v8ss vec;
4190+
if (unalign_src) {
4191+
vec = (v8ss) vec_ldu((unsigned char *) (src + i));
4192+
} else {
4193+
vec = vec_ld(0, src + i);
4194+
}
4195+
4196+
v8ss low = vec_mergeh(vec, vec); // low 1 1 2 2 3 3 4 4
4197+
v8ss high = vec_mergel(vec, vec); // high 5 5 6 6 7 7 8 8
4198+
v4si lows = vec_sra(*(v4si*)&low, shift); // make low 1 -1 2 -1 3 -1 4 -4
4199+
v4si highs = vec_sra(*(v4si*)&high, shift); // make high 5 -1 6 -1 7 -1 8 -1
4200+
v4sf lowf = vec_ctf(*(v4si*)&lows, 0);
4201+
v4sf highf = vec_ctf(*(v4si*)&highs, 0);
4202+
4203+
// convert the vector to float and scale it
4204+
v4sf floatlo = vec_mul(lowf, scale_fact_vec);
4205+
v4sf floathi = vec_mul(highf, scale_fact_vec);
4206+
4207+
if (unalign_dst) {
4208+
vec_stu(*(v16u8 *) &floatlo, (unsigned char *) (dst + i));
4209+
vec_stu(*(v16u8 *) &floathi, (unsigned char *) (dst + i + ALTIVEC_LEN_FLOAT));
4210+
} else {
4211+
vec_st(floatlo, 0, dst + i);
4212+
vec_st(floathi, 0, dst + i + ALTIVEC_LEN_FLOAT);
4213+
}
4214+
}
40834215
}
40844216

40854217
for (int i = stop_len; i < len; i++) {
40864218
dst[i] = (float) src[i] * scale_fact_mult;
40874219
}
40884220
}
4089-
*/
40904221
#endif

simd_utils_altivec_int32.h

+58
Original file line numberDiff line numberDiff line change
@@ -796,3 +796,61 @@ static inline void absdiff16s_128s(int16_t *src1, int16_t *src2, int16_t *dst, i
796796
dst[i] = abs(src1[i] - src2[i]);
797797
}
798798
}
799+
800+
// Works with positive scale_factor (divides final value)
801+
static inline void sum16s32s128(int16_t *src, int len, int32_t *dst, int scale_factor)
802+
{
803+
int stop_len = len / (4 * ALTIVEC_LEN_INT16);
804+
stop_len *= (4 * ALTIVEC_LEN_INT16);
805+
806+
__attribute__((aligned(ALTIVEC_LEN_BYTES))) int32_t accumulate[ALTIVEC_LEN_INT32];
807+
int32_t tmp_acc = 0;
808+
int16_t scale = 1 << scale_factor;
809+
v8ss one = vec_splats(1);
810+
v4si vec_acc1 = *(v4si*)_ps_0; // initialize the vector accumulator
811+
v4si vec_acc2 = *(v4si*)_ps_0; // initialize the vector accumulator
812+
813+
if (isAligned((uintptr_t) (src), ALTIVEC_LEN_BYTES)) {
814+
for (int i = 0; i < stop_len; i += 4 * ALTIVEC_LEN_INT16) {
815+
v8ss vec_src_tmp = vec_ld(0, src + i);
816+
v8ss vec_src_tmp2 = vec_ld(0, src + i + ALTIVEC_LEN_INT16);
817+
v8ss vec_src_tmp3 = vec_ld(0, src + i + 2 * ALTIVEC_LEN_INT16);
818+
v8ss vec_src_tmp4 = vec_ld(0, src + i + 3 * ALTIVEC_LEN_INT16);
819+
v4si vec_src_tmpi = vec_msum(vec_src_tmp, one, *(v4si*)_ps_0);
820+
v4si vec_src_tmp2i = vec_msum(vec_src_tmp2, one, *(v4si*)_ps_0);
821+
v4si vec_src_tmp3i = vec_msum(vec_src_tmp3, one, *(v4si*)_ps_0);
822+
v4si vec_src_tmp4i = vec_msum(vec_src_tmp4, one, *(v4si*)_ps_0);
823+
vec_src_tmpi = vec_add(vec_src_tmpi, vec_src_tmp2i);
824+
vec_src_tmp3i = vec_add(vec_src_tmp3i, vec_src_tmp4i);
825+
vec_acc1 = vec_add(vec_src_tmpi, vec_acc1);
826+
vec_acc2 = vec_add(vec_src_tmp3i, vec_acc2);
827+
}
828+
} else {
829+
for (int i = 0; i < stop_len; i += 4 * ALTIVEC_LEN_INT16) {
830+
v8ss vec_src_tmp = (v8ss) vec_ldu((unsigned char *) (src + i));
831+
v8ss vec_src_tmp2 = (v8ss) vec_ldu((unsigned char *) (src + i + ALTIVEC_LEN_INT16));
832+
v8ss vec_src_tmp3 = (v8ss) vec_ldu((unsigned char *) (src + i + 2 * ALTIVEC_LEN_INT16));
833+
v8ss vec_src_tmp4 = (v8ss) vec_ldu((unsigned char *) (src + i + 3 * ALTIVEC_LEN_INT16));
834+
v4si vec_src_tmpi = vec_msum(vec_src_tmp, one, *(v4si*)_ps_0);
835+
v4si vec_src_tmp2i = vec_msum(vec_src_tmp2, one, *(v4si*)_ps_0);
836+
v4si vec_src_tmp3i = vec_msum(vec_src_tmp3, one, *(v4si*)_ps_0);
837+
v4si vec_src_tmp4i = vec_msum(vec_src_tmp4, one, *(v4si*)_ps_0);
838+
vec_src_tmpi = vec_add(vec_src_tmpi, vec_src_tmp2i);
839+
vec_src_tmp3i = vec_add(vec_src_tmp3i, vec_src_tmp4i);
840+
vec_acc1 = vec_add(vec_src_tmpi, vec_acc1);
841+
vec_acc2 = vec_add(vec_src_tmp3i, vec_acc2);
842+
}
843+
}
844+
845+
vec_acc1 = vec_add(vec_acc1, vec_acc2);
846+
vec_st(vec_acc1, 0, accumulate);
847+
848+
for (int i = stop_len; i < len; i++) {
849+
tmp_acc += (int32_t) src[i];
850+
}
851+
852+
tmp_acc = tmp_acc + accumulate[0] + accumulate[1] + accumulate[2] + accumulate[3];
853+
854+
tmp_acc /= scale;
855+
*dst = tmp_acc;
856+
}

simd_utils_constants.h

+35
Original file line numberDiff line numberDiff line change
@@ -221,10 +221,20 @@ typedef int32x4_t v4si; // vector of 4 uint32
221221
typedef float32x4x2_t v4sfx2;
222222
typedef float64x2x2_t v2sdx2;
223223

224+
typedef int8x16_t v8ss;
225+
typedef uint8x16_t v8us;
226+
typedef uint16x8_t v16u8;
227+
typedef uint16x8_t v16s8;
228+
224229
#else
225230

226231
typedef __m128 v4sf; // vector of 4 float (sse1)
227232
typedef __m128i v4si; // vector of 4 int (sse2)
233+
// 8 x 16-bit and 16 x 8-bit integer vector aliases; all of them map to __m128i
typedef __m128i v8ss, v8us, v16u8, v16s8;
237+
228238
typedef struct {
229239
v4sf val[2];
230240
} v4sfx2;
@@ -1330,6 +1340,31 @@ static inline void print4x(v4sf v)
13301340
//printf("[%0.3f, %0.3f, %0.3f, %0.3f]", p[0], p[1], p[2], p[3]);
13311341
}
13321342

1343+
/*
 * Debug helper: prints the eight 16-bit lanes of v as 4-digit hexadecimal
 * values between brackets, without a trailing newline.
 */
static inline void print8xs(v8ss v)
{
    // Read the lanes as unsigned: a negative short would be sign-extended
    // by the default argument promotion and printed as 8 hex digits
    const unsigned short *p = (const unsigned short *) &v;
#ifndef __SSE2__
#ifndef ALTIVEC
    _mm_empty();
#endif
#endif
    printf("[%04x, %04x, %04x, %04x, %04x, %04x, %04x, %04x]",
           p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]);
}
1354+
1355+
/*
 * Debug helper: dumps the sixteen bytes of v as 2-digit hexadecimal values
 * between brackets, without a trailing newline.
 */
static inline void print16xu(v16u8 v)
{
#if !defined(__SSE2__) && !defined(ALTIVEC)
    _mm_empty();
#endif
    const uint8_t *bytes = (const uint8_t *) &v;

    printf("[%02x, %02x, %02x, %02x, %02x, %02x, %02x, %02x,%02x, %02x, %02x, %02x, %02x, %02x, %02x, %02x]",
           bytes[0], bytes[1], bytes[2], bytes[3],
           bytes[4], bytes[5], bytes[6], bytes[7],
           bytes[8], bytes[9], bytes[10], bytes[11],
           bytes[12], bytes[13], bytes[14], bytes[15]);
}
1367+
13331368
static inline void print4i(v4si v)
13341369
{
13351370
int *p = (int *) &v;

simd_utils_sse_int32.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -400,7 +400,7 @@ static inline void sum16s32s128(int16_t *src, int len, int32_t *dst, int scale_f
400400
int stop_len = len / (4 * SSE_LEN_INT16);
401401
stop_len *= (4 * SSE_LEN_INT16);
402402

403-
__attribute__((aligned(SSE_LEN_BYTES))) int32_t accumulate[SSE_LEN_INT32] = {0, 0, 0, 0};
403+
__attribute__((aligned(SSE_LEN_BYTES))) int32_t accumulate[SSE_LEN_INT32];
404404
int32_t tmp_acc = 0;
405405
int16_t scale = 1 << scale_factor;
406406
v4si one = _mm_set1_epi16(1);

0 commit comments

Comments
 (0)