|
12 | 12 | #include <stdint.h>
|
13 | 13 | #include <string.h>
|
14 | 14 |
|
| 15 | +#ifndef __MACH__ |
| 16 | +#include "fpu_control.h" |
| 17 | +#endif |
| 18 | + |
15 | 19 | // In Altivec there is only mad and not mul
|
16 | 20 | static inline v4sf vec_mul(v4sf a, v4sf b)
|
17 | 21 | {
|
@@ -3862,3 +3866,225 @@ static inline void dotc128f(complex32_t *src1, complex32_t *src2, int len, compl
|
3862 | 3866 | dst->re = dst_tmp.re;
|
3863 | 3867 | dst->im = dst_tmp.im;
|
3864 | 3868 | }
|
| 3869 | + |
| 3870 | +#ifndef __MACH__ |
| 3871 | +static inline void convertFloat32ToU8_128(float *src, uint8_t *dst, int len, int rounding_mode, int scale_factor) |
| 3872 | +{ |
| 3873 | + int stop_len = len / (4 * ALTIVEC_LEN_FLOAT); |
| 3874 | + stop_len *= (4 * ALTIVEC_LEN_FLOAT); |
| 3875 | + |
| 3876 | + float scale_fact_mult = 1.0f / (float) (1 << scale_factor); |
| 3877 | + v4sf scale_fact_vec = vec_splats(scale_fact_mult); |
| 3878 | + |
| 3879 | + fpu_control_t _mm_rounding_ori; // save rounding mode |
| 3880 | + _FPU_GETCW(_mm_rounding_ori); |
| 3881 | + |
| 3882 | + int rounding_ori = fegetround(); |
| 3883 | + |
| 3884 | + if (rounding_mode == RndZero) { |
| 3885 | + _FPU_SETCW(_FPU_RC_ZERO | _FPU_DEFAULT); // rounding_vec = ROUNDTOZERO; |
| 3886 | + fesetround(FE_TOWARDZERO); |
| 3887 | + } else if (rounding_mode == RndFinancial) { // nothing to do, Default bankers rounding => round to nearest even |
| 3888 | + } else { |
| 3889 | + _FPU_SETCW(_FPU_RC_NEAREST | _FPU_DEFAULT); // rounding_vec = ROUNDTONEAREST; |
| 3890 | + fesetround(FE_TONEAREST); |
| 3891 | + } |
| 3892 | + |
| 3893 | + if (areAligned2((uintptr_t) (src), (uintptr_t) (dst), ALTIVEC_LEN_BYTES)) { |
| 3894 | + for (int i = 0; i < stop_len; i += 4 * ALTIVEC_LEN_FLOAT) { |
| 3895 | + v4sf src_tmp1 = vec_ld(0, src + i); |
| 3896 | + v4sf src_tmp2 = vec_ld(0, src + i + ALTIVEC_LEN_FLOAT); |
| 3897 | + v4sf src_tmp3 = vec_ld(0, src + i + 2 * ALTIVEC_LEN_FLOAT); |
| 3898 | + v4sf src_tmp4 = vec_ld(0, src + i + 3 * ALTIVEC_LEN_FLOAT); |
| 3899 | + v4sf tmp1 = vec_mul(src_tmp1, scale_fact_vec); |
| 3900 | + v4sf tmp2 = vec_mul(src_tmp2, scale_fact_vec); |
| 3901 | + v4sf tmp3 = vec_mul(src_tmp3, scale_fact_vec); |
| 3902 | + v4sf tmp4 = vec_mul(src_tmp4, scale_fact_vec); |
| 3903 | + v4si tmp1_int = vec_cts(tmp1, 0); |
| 3904 | + v4si tmp2_int = vec_cts(tmp2, 0); |
| 3905 | + v4si tmp3_int = vec_cts(tmp3, 0); |
| 3906 | + v4si tmp4_int = vec_cts(tmp4, 0); |
| 3907 | + v8ss tmp5 = vec_packs(tmp1_int, tmp2_int); |
| 3908 | + v8ss tmp6 = vec_packs(tmp3_int, tmp4_int); |
| 3909 | + v16u8 tmp7 = vec_packsu(tmp5, tmp6); |
| 3910 | + vec_st(tmp7, 0, dst + i); |
| 3911 | + } |
| 3912 | + } else { |
| 3913 | + //TODO |
| 3914 | + } |
| 3915 | + |
| 3916 | + if (rounding_mode == RndFinancial) { |
| 3917 | + for (int i = stop_len; i < len; i++) { |
| 3918 | + float tmp = (roundf(src[i] * scale_fact_mult * 0.5f) / 2.0f); |
| 3919 | + dst[i] = (uint8_t) (tmp > 255.0f ? 255.0f : tmp); // round to nearest even with round(x/2)*2 |
| 3920 | + } |
| 3921 | + } else { |
| 3922 | + // Default round toward zero |
| 3923 | + for (int i = stop_len; i < len; i++) { |
| 3924 | + float tmp = nearbyintf(src[i] * scale_fact_mult); |
| 3925 | + dst[i] = (uint8_t) (tmp > 255.0f ? 255.0f : tmp); |
| 3926 | + } |
| 3927 | + _FPU_SETCW(_mm_rounding_ori); // restore previous rounding mode |
| 3928 | + fesetround(rounding_ori); |
| 3929 | + } |
| 3930 | +} |
| 3931 | + |
| 3932 | +static inline void convertFloat32ToI16_128(float *src, int16_t *dst, int len, int rounding_mode, int scale_factor) |
| 3933 | +{ |
| 3934 | + int stop_len = len / (4 * ALTIVEC_LEN_FLOAT); |
| 3935 | + stop_len *= (4 * ALTIVEC_LEN_FLOAT); |
| 3936 | + |
| 3937 | + float scale_fact_mult = 1.0f / (float) (1 << scale_factor); |
| 3938 | + v4sf scale_fact_vec = vec_splats(scale_fact_mult); |
| 3939 | + |
| 3940 | + fpu_control_t _mm_rounding_ori; // save rounding mode |
| 3941 | + _FPU_GETCW(_mm_rounding_ori); |
| 3942 | + |
| 3943 | + int rounding_ori = fegetround(); |
| 3944 | + |
| 3945 | + if (rounding_mode == RndZero) { |
| 3946 | + _FPU_SETCW(_FPU_RC_ZERO | _FPU_DEFAULT); // rounding_vec = ROUNDTOZERO; |
| 3947 | + fesetround(FE_TOWARDZERO); |
| 3948 | + } else if (rounding_mode == RndFinancial) { // nothing to do, Default bankers rounding => round to nearest even |
| 3949 | + } else { |
| 3950 | + _FPU_SETCW(_FPU_RC_NEAREST | _FPU_DEFAULT); // rounding_vec = ROUNDTONEAREST; |
| 3951 | + fesetround(FE_TONEAREST); |
| 3952 | + } |
| 3953 | + |
| 3954 | + if (areAligned2((uintptr_t) (src), (uintptr_t) (dst), ALTIVEC_LEN_BYTES)) { |
| 3955 | + for (int i = 0; i < stop_len; i += 4 * ALTIVEC_LEN_FLOAT) { |
| 3956 | + v4sf src_tmp1 = vec_ld(0, src + i); |
| 3957 | + v4sf src_tmp2 = vec_ld(0, src + i + ALTIVEC_LEN_FLOAT); |
| 3958 | + v4sf src_tmp3 = vec_ld(0, src + i + 2 * ALTIVEC_LEN_FLOAT); |
| 3959 | + v4sf src_tmp4 = vec_ld(0, src + i + 3 * ALTIVEC_LEN_FLOAT); |
| 3960 | + v4sf tmp1 = vec_mul(src_tmp1, scale_fact_vec); |
| 3961 | + v4sf tmp2 = vec_mul(src_tmp2, scale_fact_vec); |
| 3962 | + v4sf tmp3 = vec_mul(src_tmp3, scale_fact_vec); |
| 3963 | + v4sf tmp4 = vec_mul(src_tmp4, scale_fact_vec); |
| 3964 | + v4si tmp1_int = vec_cts(tmp1, 0); |
| 3965 | + v4si tmp2_int = vec_cts(tmp2, 0); |
| 3966 | + v4si tmp3_int = vec_cts(tmp3, 0); |
| 3967 | + v4si tmp4_int = vec_cts(tmp4, 0); |
| 3968 | + v8ss tmp5 = vec_packs(tmp1_int, tmp2_int); |
| 3969 | + v8ss tmp6 = vec_packs(tmp3_int, tmp4_int); |
| 3970 | + vec_st(tmp5, 0, dst + i); |
| 3971 | + vec_st(tmp6, 0, dst + i + ALTIVEC_LEN_INT16); |
| 3972 | + } |
| 3973 | + } else { |
| 3974 | + //TODO |
| 3975 | + } |
| 3976 | + |
| 3977 | + if (rounding_mode == RndFinancial) { |
| 3978 | + for (int i = stop_len; i < len; i++) { |
| 3979 | + float tmp = (roundf(src[i] * scale_fact_mult * 0.5f) / 2.0f); |
| 3980 | + dst[i] = (int16_t) (tmp > 32767.0f ? 32767.0f : tmp); // round to nearest even with round(x/2)*2 |
| 3981 | + } |
| 3982 | + } else { |
| 3983 | + // Default round toward zero |
| 3984 | + for (int i = stop_len; i < len; i++) { |
| 3985 | + float tmp = nearbyintf(src[i] * scale_fact_mult); |
| 3986 | + dst[i] = (int16_t) (tmp > 32767.0f ? 32767.0f : tmp); |
| 3987 | + } |
| 3988 | + _FPU_SETCW(_mm_rounding_ori); // restore previous rounding mode |
| 3989 | + fesetround(rounding_ori); |
| 3990 | + } |
| 3991 | +} |
| 3992 | + |
| 3993 | +static inline void convertFloat32ToU16_128(float *src, uint16_t *dst, int len, int rounding_mode, int scale_factor) |
| 3994 | +{ |
| 3995 | + int stop_len = len / (4 * ALTIVEC_LEN_FLOAT); |
| 3996 | + stop_len *= (4 * ALTIVEC_LEN_FLOAT); |
| 3997 | + |
| 3998 | + float scale_fact_mult = 1.0f / (float) (1 << scale_factor); |
| 3999 | + v4sf scale_fact_vec = vec_splats(scale_fact_mult); |
| 4000 | + |
| 4001 | + fpu_control_t _mm_rounding_ori; // save rounding mode |
| 4002 | + _FPU_GETCW(_mm_rounding_ori); |
| 4003 | + |
| 4004 | + int rounding_ori = fegetround(); |
| 4005 | + |
| 4006 | + if (rounding_mode == RndZero) { |
| 4007 | + _FPU_SETCW(_FPU_RC_ZERO | _FPU_DEFAULT); // rounding_vec = ROUNDTOZERO; |
| 4008 | + fesetround(FE_TOWARDZERO); |
| 4009 | + } else if (rounding_mode == RndFinancial) { // nothing to do, Default bankers rounding => round to nearest even |
| 4010 | + } else { |
| 4011 | + _FPU_SETCW(_FPU_RC_NEAREST | _FPU_DEFAULT); // rounding_vec = ROUNDTONEAREST; |
| 4012 | + fesetround(FE_TONEAREST); |
| 4013 | + } |
| 4014 | + |
| 4015 | + if (areAligned2((uintptr_t) (src), (uintptr_t) (dst), ALTIVEC_LEN_BYTES)) { |
| 4016 | + for (int i = 0; i < stop_len; i += 4 * ALTIVEC_LEN_FLOAT) { |
| 4017 | + v4sf src_tmp1 = vec_ld(0, src + i); |
| 4018 | + v4sf src_tmp2 = vec_ld(0, src + i + ALTIVEC_LEN_FLOAT); |
| 4019 | + v4sf src_tmp3 = vec_ld(0, src + i + 2 * ALTIVEC_LEN_FLOAT); |
| 4020 | + v4sf src_tmp4 = vec_ld(0, src + i + 3 * ALTIVEC_LEN_FLOAT); |
| 4021 | + v4sf tmp1 = vec_mul(src_tmp1, scale_fact_vec); |
| 4022 | + v4sf tmp2 = vec_mul(src_tmp2, scale_fact_vec); |
| 4023 | + v4sf tmp3 = vec_mul(src_tmp3, scale_fact_vec); |
| 4024 | + v4sf tmp4 = vec_mul(src_tmp4, scale_fact_vec); |
| 4025 | + v4si tmp1_int = vec_cts(tmp1, 0); |
| 4026 | + v4si tmp2_int = vec_cts(tmp2, 0); |
| 4027 | + v4si tmp3_int = vec_cts(tmp3, 0); |
| 4028 | + v4si tmp4_int = vec_cts(tmp4, 0); |
| 4029 | + v8us tmp5 = vec_packsu(tmp1_int, tmp2_int); |
| 4030 | + v8us tmp6 = vec_packsu(tmp3_int, tmp4_int); |
| 4031 | + vec_st(tmp5, 0, dst + i); |
| 4032 | + vec_st(tmp6, 0, dst + i + ALTIVEC_LEN_INT16); |
| 4033 | + } |
| 4034 | + } else { |
| 4035 | + //TODO |
| 4036 | + } |
| 4037 | + |
| 4038 | + if (rounding_mode == RndFinancial) { |
| 4039 | + for (int i = stop_len; i < len; i++) { |
| 4040 | + float tmp = (roundf(src[i] * scale_fact_mult * 0.5f) / 2.0f); |
| 4041 | + dst[i] = (uint16_t) (tmp > 65535.0f ? 65535.0f : tmp); // round to nearest even with round(x/2)*2 |
| 4042 | + } |
| 4043 | + } else { |
| 4044 | + // Default round toward zero |
| 4045 | + for (int i = stop_len; i < len; i++) { |
| 4046 | + float tmp = nearbyintf(src[i] * scale_fact_mult); |
| 4047 | + dst[i] = (uint16_t) (tmp > 65535.0f ? 65535.0f : tmp); // round to nearest even with round(x/2)*2 |
| 4048 | + } |
| 4049 | + _FPU_SETCW(_mm_rounding_ori); // restore previous rounding mode |
| 4050 | + fesetround(rounding_ori); |
| 4051 | + } |
| 4052 | +} |
| 4053 | + |
| 4054 | +/* |
| 4055 | +static inline void convertInt16ToFloat32_128(int16_t *src, float *dst, int len, int scale_factor) |
| 4056 | +{ |
| 4057 | + int stop_len = len / (2 * ALTIVEC_LEN_FLOAT); |
| 4058 | + stop_len *= (2 * ALTIVEC_LEN_FLOAT); |
| 4059 | +
|
| 4060 | + float scale_fact_mult = 1.0f / (float) (1 << scale_factor); |
| 4061 | + v4sf scale_fact_vec = vec_splats(scale_fact_mult); |
| 4062 | +
|
| 4063 | + if (areAligned2((uintptr_t) (src), (uintptr_t) (dst), ALTIVEC_LEN_BYTES)) { |
| 4064 | + for (int i = 0; i < stop_len; i += 2 * ALTIVEC_LEN_FLOAT) { |
        v8ss vec = vec_ld(0, src + i);  // loads 1 2 3 4 5 6 7 8
| 4066 | + v8ss low = vec_mergeh(vec, vec); // low 1 1 2 2 3 3 4 4 |
| 4067 | + v8ss high = vec_mergel(vec, vec); // high 5 5 6 6 7 7 8 8 |
| 4068 | + v4ui shift = vec_splats((unsigned int)16); |
| 4069 | + v16u8 lowu = vec_sra(*(v16u8*)&low, *(v16u8*)&shift); // make low 1 -1 2 -1 3 -1 4 -4 |
| 4070 | + v16u8 highu = vec_sra(*(v16u8*)&high, *(v16u8*)&shift); // make high 5 -1 6 -1 7 -1 8 -1 |
| 4071 | + v4sf lowf = vec_ctf(*(v4si*)&lowu, 0); |
| 4072 | + v4sf highf = vec_ctf(*(v4si*)&highu, 0); |
| 4073 | + |
| 4074 | + // convert the vector to float and scale it |
| 4075 | + v4sf floatlo = vec_mul(lowf, scale_fact_vec); |
| 4076 | + v4sf floathi = vec_mul(highf, scale_fact_vec); |
| 4077 | +
|
| 4078 | + vec_st(floatlo, 0, dst + i); |
| 4079 | + vec_st(floathi, 0, dst + i + ALTIVEC_LEN_FLOAT); |
| 4080 | + } |
| 4081 | + } else { |
| 4082 | + //TODO |
| 4083 | + } |
| 4084 | +
|
| 4085 | + for (int i = stop_len; i < len; i++) { |
| 4086 | + dst[i] = (float) src[i] * scale_fact_mult; |
| 4087 | + } |
| 4088 | +} |
| 4089 | +*/ |
| 4090 | +#endif |
0 commit comments