Skip to content

Commit a8e88e4

Browse files
committed
added multiple Altivec functions
1 parent 88fe89d commit a8e88e4

File tree

4 files changed

+234
-7
lines changed

4 files changed

+234
-7
lines changed

README.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -105,9 +105,9 @@ The following table is a work in progress, "?" means there is not yet an impleme
105105
| dotXf (a) | dotf_C | ippsDotProd_32f | ? |
106106
| dotcXf (a) | dotcf_C | ippsDotProd_32fc | ? |
107107
| vectorSlopeXf (a) | vectorSlopef_C | ippsVectorSlope_32f | vectorSlopef_vec |
108-
| convertFloat32ToU8_X | convertFloat32ToU8_C | ippsConvert_32f8u_Sfs | ? |
109-
| convertFloat32ToU16_X | convertFloat32ToI16_C | ippsConvert_32f16u_Sfs | ? |
110-
| convertFloat32ToI16_X | convertFloat32ToI16_C | ippsConvert_32f16s_Sfs | ? |
108+
| convertFloat32ToU8_X (a) | convertFloat32ToU8_C | ippsConvert_32f8u_Sfs | ? |
109+
| convertFloat32ToU16_X (a) | convertFloat32ToU16_C | ippsConvert_32f16u_Sfs | ? |
110+
| convertFloat32ToI16_X (a) | convertFloat32ToI16_C | ippsConvert_32f16s_Sfs | ? |
111111
| convertInt16ToFloat32_X | convertInt16ToFloat32_C | ippsConvert_16s32f_Sfs | ? |
112112
| cplxtorealXf (a) | cplxtorealf_C | ippsCplxToReal_32fc | cplxtorealf_vec |
113113
| realtocplxXf (a) | realtocplx_C | ippsRealToCplx_32f | realtocplxf_vec |

simd_test.c

+4-4
Original file line numberDiff line numberDiff line change
@@ -9559,7 +9559,7 @@ for (int i = 0; i < len; i++){
95599559

95609560
#endif
95619561

9562-
#ifdef SSE
9562+
#if defined(SSE) || defined (ALTIVEC)
95639563
clock_gettime(CLOCK_REALTIME, &start);
95649564
convertFloat32ToU8_128(inout, inout_u1, len, RndZero, 4);
95659565
clock_gettime(CLOCK_REALTIME, &stop);
@@ -9652,7 +9652,7 @@ for (int i = 0; i < len; i++){
96529652
l2_err_i16(inout_s1, inout_s2, len);
96539653
#endif
96549654

9655-
#ifdef SSE
9655+
#if defined(SSE) || defined(ALTIVEC)
96569656
clock_gettime(CLOCK_REALTIME, &start);
96579657
convertFloat32ToI16_128(inout, inout_s2, len, RndZero, 4);
96589658
clock_gettime(CLOCK_REALTIME, &stop);
@@ -9742,7 +9742,7 @@ for (int i = 0; i < len; i++){
97429742
l2_err_i16(inout_s1, inout_s2, len);
97439743
#endif
97449744

9745-
#ifdef SSE
9745+
#if defined(SSE) || defined(ALTIVEC)
97469746
clock_gettime(CLOCK_REALTIME, &start);
97479747
convertFloat32ToU16_128(inout, (uint16_t *) inout_s2, len, RndZero, 4);
97489748
clock_gettime(CLOCK_REALTIME, &stop);
@@ -9832,7 +9832,7 @@ for (int i = 0; i < len; i++){
98329832
l2_err(inout_ref, inout2_ref, len);
98339833
#endif
98349834

9835-
#ifdef SSE
9835+
#if defined(SSE) // || defined(ALTIVEC)
98369836
clock_gettime(CLOCK_REALTIME, &start);
98379837
convertInt16ToFloat32_128(inout_s1, inout_ref, len, 4);
98389838
clock_gettime(CLOCK_REALTIME, &stop);

simd_utils_altivec_float.h

+226
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@
1212
#include <stdint.h>
1313
#include <string.h>
1414

15+
#ifndef __MACH__
16+
#include "fpu_control.h"
17+
#endif
18+
1519
// In Altivec there is only mad and not mul
1620
static inline v4sf vec_mul(v4sf a, v4sf b)
1721
{
@@ -3862,3 +3866,225 @@ static inline void dotc128f(complex32_t *src1, complex32_t *src2, int len, compl
38623866
dst->re = dst_tmp.re;
38633867
dst->im = dst_tmp.im;
38643868
}
3869+
3870+
#ifndef __MACH__
3871+
static inline void convertFloat32ToU8_128(float *src, uint8_t *dst, int len, int rounding_mode, int scale_factor)
3872+
{
3873+
int stop_len = len / (4 * ALTIVEC_LEN_FLOAT);
3874+
stop_len *= (4 * ALTIVEC_LEN_FLOAT);
3875+
3876+
float scale_fact_mult = 1.0f / (float) (1 << scale_factor);
3877+
v4sf scale_fact_vec = vec_splats(scale_fact_mult);
3878+
3879+
fpu_control_t _mm_rounding_ori; // save rounding mode
3880+
_FPU_GETCW(_mm_rounding_ori);
3881+
3882+
int rounding_ori = fegetround();
3883+
3884+
if (rounding_mode == RndZero) {
3885+
_FPU_SETCW(_FPU_RC_ZERO | _FPU_DEFAULT); // rounding_vec = ROUNDTOZERO;
3886+
fesetround(FE_TOWARDZERO);
3887+
} else if (rounding_mode == RndFinancial) { // nothing to do, Default bankers rounding => round to nearest even
3888+
} else {
3889+
_FPU_SETCW(_FPU_RC_NEAREST | _FPU_DEFAULT); // rounding_vec = ROUNDTONEAREST;
3890+
fesetround(FE_TONEAREST);
3891+
}
3892+
3893+
if (areAligned2((uintptr_t) (src), (uintptr_t) (dst), ALTIVEC_LEN_BYTES)) {
3894+
for (int i = 0; i < stop_len; i += 4 * ALTIVEC_LEN_FLOAT) {
3895+
v4sf src_tmp1 = vec_ld(0, src + i);
3896+
v4sf src_tmp2 = vec_ld(0, src + i + ALTIVEC_LEN_FLOAT);
3897+
v4sf src_tmp3 = vec_ld(0, src + i + 2 * ALTIVEC_LEN_FLOAT);
3898+
v4sf src_tmp4 = vec_ld(0, src + i + 3 * ALTIVEC_LEN_FLOAT);
3899+
v4sf tmp1 = vec_mul(src_tmp1, scale_fact_vec);
3900+
v4sf tmp2 = vec_mul(src_tmp2, scale_fact_vec);
3901+
v4sf tmp3 = vec_mul(src_tmp3, scale_fact_vec);
3902+
v4sf tmp4 = vec_mul(src_tmp4, scale_fact_vec);
3903+
v4si tmp1_int = vec_cts(tmp1, 0);
3904+
v4si tmp2_int = vec_cts(tmp2, 0);
3905+
v4si tmp3_int = vec_cts(tmp3, 0);
3906+
v4si tmp4_int = vec_cts(tmp4, 0);
3907+
v8ss tmp5 = vec_packs(tmp1_int, tmp2_int);
3908+
v8ss tmp6 = vec_packs(tmp3_int, tmp4_int);
3909+
v16u8 tmp7 = vec_packsu(tmp5, tmp6);
3910+
vec_st(tmp7, 0, dst + i);
3911+
}
3912+
} else {
3913+
//TODO
3914+
}
3915+
3916+
if (rounding_mode == RndFinancial) {
3917+
for (int i = stop_len; i < len; i++) {
3918+
float tmp = (roundf(src[i] * scale_fact_mult * 0.5f) / 2.0f);
3919+
dst[i] = (uint8_t) (tmp > 255.0f ? 255.0f : tmp); // round to nearest even with round(x/2)*2
3920+
}
3921+
} else {
3922+
// Default round toward zero
3923+
for (int i = stop_len; i < len; i++) {
3924+
float tmp = nearbyintf(src[i] * scale_fact_mult);
3925+
dst[i] = (uint8_t) (tmp > 255.0f ? 255.0f : tmp);
3926+
}
3927+
_FPU_SETCW(_mm_rounding_ori); // restore previous rounding mode
3928+
fesetround(rounding_ori);
3929+
}
3930+
}
3931+
3932+
static inline void convertFloat32ToI16_128(float *src, int16_t *dst, int len, int rounding_mode, int scale_factor)
3933+
{
3934+
int stop_len = len / (4 * ALTIVEC_LEN_FLOAT);
3935+
stop_len *= (4 * ALTIVEC_LEN_FLOAT);
3936+
3937+
float scale_fact_mult = 1.0f / (float) (1 << scale_factor);
3938+
v4sf scale_fact_vec = vec_splats(scale_fact_mult);
3939+
3940+
fpu_control_t _mm_rounding_ori; // save rounding mode
3941+
_FPU_GETCW(_mm_rounding_ori);
3942+
3943+
int rounding_ori = fegetround();
3944+
3945+
if (rounding_mode == RndZero) {
3946+
_FPU_SETCW(_FPU_RC_ZERO | _FPU_DEFAULT); // rounding_vec = ROUNDTOZERO;
3947+
fesetround(FE_TOWARDZERO);
3948+
} else if (rounding_mode == RndFinancial) { // nothing to do, Default bankers rounding => round to nearest even
3949+
} else {
3950+
_FPU_SETCW(_FPU_RC_NEAREST | _FPU_DEFAULT); // rounding_vec = ROUNDTONEAREST;
3951+
fesetround(FE_TONEAREST);
3952+
}
3953+
3954+
if (areAligned2((uintptr_t) (src), (uintptr_t) (dst), ALTIVEC_LEN_BYTES)) {
3955+
for (int i = 0; i < stop_len; i += 4 * ALTIVEC_LEN_FLOAT) {
3956+
v4sf src_tmp1 = vec_ld(0, src + i);
3957+
v4sf src_tmp2 = vec_ld(0, src + i + ALTIVEC_LEN_FLOAT);
3958+
v4sf src_tmp3 = vec_ld(0, src + i + 2 * ALTIVEC_LEN_FLOAT);
3959+
v4sf src_tmp4 = vec_ld(0, src + i + 3 * ALTIVEC_LEN_FLOAT);
3960+
v4sf tmp1 = vec_mul(src_tmp1, scale_fact_vec);
3961+
v4sf tmp2 = vec_mul(src_tmp2, scale_fact_vec);
3962+
v4sf tmp3 = vec_mul(src_tmp3, scale_fact_vec);
3963+
v4sf tmp4 = vec_mul(src_tmp4, scale_fact_vec);
3964+
v4si tmp1_int = vec_cts(tmp1, 0);
3965+
v4si tmp2_int = vec_cts(tmp2, 0);
3966+
v4si tmp3_int = vec_cts(tmp3, 0);
3967+
v4si tmp4_int = vec_cts(tmp4, 0);
3968+
v8ss tmp5 = vec_packs(tmp1_int, tmp2_int);
3969+
v8ss tmp6 = vec_packs(tmp3_int, tmp4_int);
3970+
vec_st(tmp5, 0, dst + i);
3971+
vec_st(tmp6, 0, dst + i + ALTIVEC_LEN_INT16);
3972+
}
3973+
} else {
3974+
//TODO
3975+
}
3976+
3977+
if (rounding_mode == RndFinancial) {
3978+
for (int i = stop_len; i < len; i++) {
3979+
float tmp = (roundf(src[i] * scale_fact_mult * 0.5f) / 2.0f);
3980+
dst[i] = (int16_t) (tmp > 32767.0f ? 32767.0f : tmp); // round to nearest even with round(x/2)*2
3981+
}
3982+
} else {
3983+
// Default round toward zero
3984+
for (int i = stop_len; i < len; i++) {
3985+
float tmp = nearbyintf(src[i] * scale_fact_mult);
3986+
dst[i] = (int16_t) (tmp > 32767.0f ? 32767.0f : tmp);
3987+
}
3988+
_FPU_SETCW(_mm_rounding_ori); // restore previous rounding mode
3989+
fesetround(rounding_ori);
3990+
}
3991+
}
3992+
3993+
static inline void convertFloat32ToU16_128(float *src, uint16_t *dst, int len, int rounding_mode, int scale_factor)
3994+
{
3995+
int stop_len = len / (4 * ALTIVEC_LEN_FLOAT);
3996+
stop_len *= (4 * ALTIVEC_LEN_FLOAT);
3997+
3998+
float scale_fact_mult = 1.0f / (float) (1 << scale_factor);
3999+
v4sf scale_fact_vec = vec_splats(scale_fact_mult);
4000+
4001+
fpu_control_t _mm_rounding_ori; // save rounding mode
4002+
_FPU_GETCW(_mm_rounding_ori);
4003+
4004+
int rounding_ori = fegetround();
4005+
4006+
if (rounding_mode == RndZero) {
4007+
_FPU_SETCW(_FPU_RC_ZERO | _FPU_DEFAULT); // rounding_vec = ROUNDTOZERO;
4008+
fesetround(FE_TOWARDZERO);
4009+
} else if (rounding_mode == RndFinancial) { // nothing to do, Default bankers rounding => round to nearest even
4010+
} else {
4011+
_FPU_SETCW(_FPU_RC_NEAREST | _FPU_DEFAULT); // rounding_vec = ROUNDTONEAREST;
4012+
fesetround(FE_TONEAREST);
4013+
}
4014+
4015+
if (areAligned2((uintptr_t) (src), (uintptr_t) (dst), ALTIVEC_LEN_BYTES)) {
4016+
for (int i = 0; i < stop_len; i += 4 * ALTIVEC_LEN_FLOAT) {
4017+
v4sf src_tmp1 = vec_ld(0, src + i);
4018+
v4sf src_tmp2 = vec_ld(0, src + i + ALTIVEC_LEN_FLOAT);
4019+
v4sf src_tmp3 = vec_ld(0, src + i + 2 * ALTIVEC_LEN_FLOAT);
4020+
v4sf src_tmp4 = vec_ld(0, src + i + 3 * ALTIVEC_LEN_FLOAT);
4021+
v4sf tmp1 = vec_mul(src_tmp1, scale_fact_vec);
4022+
v4sf tmp2 = vec_mul(src_tmp2, scale_fact_vec);
4023+
v4sf tmp3 = vec_mul(src_tmp3, scale_fact_vec);
4024+
v4sf tmp4 = vec_mul(src_tmp4, scale_fact_vec);
4025+
v4si tmp1_int = vec_cts(tmp1, 0);
4026+
v4si tmp2_int = vec_cts(tmp2, 0);
4027+
v4si tmp3_int = vec_cts(tmp3, 0);
4028+
v4si tmp4_int = vec_cts(tmp4, 0);
4029+
v8us tmp5 = vec_packsu(tmp1_int, tmp2_int);
4030+
v8us tmp6 = vec_packsu(tmp3_int, tmp4_int);
4031+
vec_st(tmp5, 0, dst + i);
4032+
vec_st(tmp6, 0, dst + i + ALTIVEC_LEN_INT16);
4033+
}
4034+
} else {
4035+
//TODO
4036+
}
4037+
4038+
if (rounding_mode == RndFinancial) {
4039+
for (int i = stop_len; i < len; i++) {
4040+
float tmp = (roundf(src[i] * scale_fact_mult * 0.5f) / 2.0f);
4041+
dst[i] = (uint16_t) (tmp > 65535.0f ? 65535.0f : tmp); // round to nearest even with round(x/2)*2
4042+
}
4043+
} else {
4044+
// Default round toward zero
4045+
for (int i = stop_len; i < len; i++) {
4046+
float tmp = nearbyintf(src[i] * scale_fact_mult);
4047+
dst[i] = (uint16_t) (tmp > 65535.0f ? 65535.0f : tmp); // round to nearest even with round(x/2)*2
4048+
}
4049+
_FPU_SETCW(_mm_rounding_ori); // restore previous rounding mode
4050+
fesetround(rounding_ori);
4051+
}
4052+
}
4053+
4054+
/*
4055+
static inline void convertInt16ToFloat32_128(int16_t *src, float *dst, int len, int scale_factor)
4056+
{
4057+
int stop_len = len / (2 * ALTIVEC_LEN_FLOAT);
4058+
stop_len *= (2 * ALTIVEC_LEN_FLOAT);
4059+
4060+
float scale_fact_mult = 1.0f / (float) (1 << scale_factor);
4061+
v4sf scale_fact_vec = vec_splats(scale_fact_mult);
4062+
4063+
if (areAligned2((uintptr_t) (src), (uintptr_t) (dst), ALTIVEC_LEN_BYTES)) {
4064+
for (int i = 0; i < stop_len; i += 2 * ALTIVEC_LEN_FLOAT) {
4065+
v8ss vec = vec_ld(0, src + i); // loads 1 2 3 4 5 6 7 8
4066+
v8ss low = vec_mergeh(vec, vec); // low 1 1 2 2 3 3 4 4
4067+
v8ss high = vec_mergel(vec, vec); // high 5 5 6 6 7 7 8 8
4068+
v4ui shift = vec_splats((unsigned int)16);
4069+
v16u8 lowu = vec_sra(*(v16u8*)&low, *(v16u8*)&shift); // make low 1 -1 2 -1 3 -1 4 -1
4070+
v16u8 highu = vec_sra(*(v16u8*)&high, *(v16u8*)&shift); // make high 5 -1 6 -1 7 -1 8 -1
4071+
v4sf lowf = vec_ctf(*(v4si*)&lowu, 0);
4072+
v4sf highf = vec_ctf(*(v4si*)&highu, 0);
4073+
4074+
// convert the vector to float and scale it
4075+
v4sf floatlo = vec_mul(lowf, scale_fact_vec);
4076+
v4sf floathi = vec_mul(highf, scale_fact_vec);
4077+
4078+
vec_st(floatlo, 0, dst + i);
4079+
vec_st(floathi, 0, dst + i + ALTIVEC_LEN_FLOAT);
4080+
}
4081+
} else {
4082+
//TODO
4083+
}
4084+
4085+
for (int i = stop_len; i < len; i++) {
4086+
dst[i] = (float) src[i] * scale_fact_mult;
4087+
}
4088+
}
4089+
*/
4090+
#endif

simd_utils_constants.h

+1
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,7 @@ typedef __vector float v4sf;
181181
typedef __vector int v4si;
182182
typedef __vector unsigned int v4ui;
183183
typedef __vector short v8ss;
184+
typedef __vector unsigned short v8us;
184185
typedef __vector unsigned char v16u8;
185186
typedef __vector char v16s8;
186187
typedef __vector bool int v4bi;

0 commit comments

Comments
 (0)