@@ -3910,7 +3910,41 @@ static inline void convertFloat32ToU8_128(float *src, uint8_t *dst, int len, int rounding_mode, int scale_factor)
            vec_st(tmp7, 0, dst + i);
        }
    } else {
-        //TODO
+        int unalign_src = (uintptr_t) (src) % ALTIVEC_LEN_BYTES;
+        int unalign_dst = (uintptr_t) (dst) % ALTIVEC_LEN_BYTES;
+
+        for (int i = 0; i < stop_len; i += 4 * ALTIVEC_LEN_FLOAT) {
+            v4sf src_tmp1, src_tmp2, src_tmp3, src_tmp4;
+            if (unalign_src) {
+                src_tmp1 = (v4sf) vec_ldu((unsigned char *) (src + i));
+                src_tmp2 = (v4sf) vec_ldu((unsigned char *) (src + i + ALTIVEC_LEN_FLOAT));
+                src_tmp3 = (v4sf) vec_ldu((unsigned char *) (src + i + 2 * ALTIVEC_LEN_FLOAT));
+                src_tmp4 = (v4sf) vec_ldu((unsigned char *) (src + i + 3 * ALTIVEC_LEN_FLOAT));
+            } else {
+                src_tmp1 = vec_ld(0, src + i);
+                src_tmp2 = vec_ld(0, src + i + ALTIVEC_LEN_FLOAT);
+                src_tmp3 = vec_ld(0, src + i + 2 * ALTIVEC_LEN_FLOAT);
+                src_tmp4 = vec_ld(0, src + i + 3 * ALTIVEC_LEN_FLOAT);
+            }
+
+            v4sf tmp1 = vec_mul(src_tmp1, scale_fact_vec);
+            v4sf tmp2 = vec_mul(src_tmp2, scale_fact_vec);
+            v4sf tmp3 = vec_mul(src_tmp3, scale_fact_vec);
+            v4sf tmp4 = vec_mul(src_tmp4, scale_fact_vec);
+            v4si tmp1_int = vec_cts(tmp1, 0);
+            v4si tmp2_int = vec_cts(tmp2, 0);
+            v4si tmp3_int = vec_cts(tmp3, 0);
+            v4si tmp4_int = vec_cts(tmp4, 0);
+            v8ss tmp5 = vec_packs(tmp1_int, tmp2_int);
+            v8ss tmp6 = vec_packs(tmp3_int, tmp4_int);
+            v16u8 tmp7 = vec_packsu(tmp5, tmp6);
+
+            if (unalign_dst) {
+                vec_stu(*(v16u8 *) &tmp7, (unsigned char *) (dst + i));
+            } else {
+                vec_st(tmp7, 0, dst + i);
+            }
+        }
    }

    if (rounding_mode == RndFinancial) {
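
The new unaligned path mirrors the aligned one: vec_ldu/vec_stu appear to be the library's unaligned load/store helpers, and the arithmetic is unchanged. Per output byte, the pipeline reduces to the following scalar model (an illustrative sketch, not code from this commit; vec_cts truncates toward zero, and the vec_packs/vec_packsu chain saturates into the uint8_t range):

#include <stdint.h>

/* Scalar model of one lane: scale, truncate toward zero like vec_cts(x, 0),
 * then clamp into [0, 255] like the vec_packs + vec_packsu chain. */
static uint8_t float_to_u8_lane(float x, float scale_fact)
{
    float scaled = x * scale_fact;
    if (scaled < 0.0f)        /* saturating pack clamps negatives to 0 */
        scaled = 0.0f;
    else if (scaled > 255.0f) /* and overflows to 255 */
        scaled = 255.0f;
    return (uint8_t) scaled;  /* C float->int cast truncates toward zero */
}
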
@@ -3971,7 +4005,42 @@ static inline void convertFloat32ToI16_128(float *src, int16_t *dst, int len, int rounding_mode, int scale_factor)
            vec_st(tmp6, 0, dst + i + ALTIVEC_LEN_INT16);
        }
    } else {
-        //TODO
+        int unalign_src = (uintptr_t) (src) % ALTIVEC_LEN_BYTES;
+        int unalign_dst = (uintptr_t) (dst) % ALTIVEC_LEN_BYTES;
+
+        for (int i = 0; i < stop_len; i += 4 * ALTIVEC_LEN_FLOAT) {
+            v4sf src_tmp1, src_tmp2, src_tmp3, src_tmp4;
+            if (unalign_src) {
+                src_tmp1 = (v4sf) vec_ldu((unsigned char *) (src + i));
+                src_tmp2 = (v4sf) vec_ldu((unsigned char *) (src + i + ALTIVEC_LEN_FLOAT));
+                src_tmp3 = (v4sf) vec_ldu((unsigned char *) (src + i + 2 * ALTIVEC_LEN_FLOAT));
+                src_tmp4 = (v4sf) vec_ldu((unsigned char *) (src + i + 3 * ALTIVEC_LEN_FLOAT));
+            } else {
+                src_tmp1 = vec_ld(0, src + i);
+                src_tmp2 = vec_ld(0, src + i + ALTIVEC_LEN_FLOAT);
+                src_tmp3 = vec_ld(0, src + i + 2 * ALTIVEC_LEN_FLOAT);
+                src_tmp4 = vec_ld(0, src + i + 3 * ALTIVEC_LEN_FLOAT);
+            }
+
+            v4sf tmp1 = vec_mul(src_tmp1, scale_fact_vec);
+            v4sf tmp2 = vec_mul(src_tmp2, scale_fact_vec);
+            v4sf tmp3 = vec_mul(src_tmp3, scale_fact_vec);
+            v4sf tmp4 = vec_mul(src_tmp4, scale_fact_vec);
+            v4si tmp1_int = vec_cts(tmp1, 0);
+            v4si tmp2_int = vec_cts(tmp2, 0);
+            v4si tmp3_int = vec_cts(tmp3, 0);
+            v4si tmp4_int = vec_cts(tmp4, 0);
+            v8ss tmp5 = vec_packs(tmp1_int, tmp2_int);
+            v8ss tmp6 = vec_packs(tmp3_int, tmp4_int);
+
+            if (unalign_dst) {
+                vec_stu(*(v16u8 *) &tmp5, (unsigned char *) (dst + i));
+                vec_stu(*(v16u8 *) &tmp6, (unsigned char *) (dst + i + ALTIVEC_LEN_INT16));
+            } else {
+                vec_st(tmp5, 0, dst + i);
+                vec_st(tmp6, 0, dst + i + ALTIVEC_LEN_INT16);
+            }
+        }
    }

    if (rounding_mode == RndFinancial) {
@@ -4032,7 +4101,42 @@ static inline void convertFloat32ToU16_128(float *src, uint16_t *dst, int len, int rounding_mode, int scale_factor)
            vec_st(tmp6, 0, dst + i + ALTIVEC_LEN_INT16);
        }
    } else {
-        //TODO
+        int unalign_src = (uintptr_t) (src) % ALTIVEC_LEN_BYTES;
+        int unalign_dst = (uintptr_t) (dst) % ALTIVEC_LEN_BYTES;
+
+        for (int i = 0; i < stop_len; i += 4 * ALTIVEC_LEN_FLOAT) {
+            v4sf src_tmp1, src_tmp2, src_tmp3, src_tmp4;
+            if (unalign_src) {
+                src_tmp1 = (v4sf) vec_ldu((unsigned char *) (src + i));
+                src_tmp2 = (v4sf) vec_ldu((unsigned char *) (src + i + ALTIVEC_LEN_FLOAT));
+                src_tmp3 = (v4sf) vec_ldu((unsigned char *) (src + i + 2 * ALTIVEC_LEN_FLOAT));
+                src_tmp4 = (v4sf) vec_ldu((unsigned char *) (src + i + 3 * ALTIVEC_LEN_FLOAT));
+            } else {
+                src_tmp1 = vec_ld(0, src + i);
+                src_tmp2 = vec_ld(0, src + i + ALTIVEC_LEN_FLOAT);
+                src_tmp3 = vec_ld(0, src + i + 2 * ALTIVEC_LEN_FLOAT);
+                src_tmp4 = vec_ld(0, src + i + 3 * ALTIVEC_LEN_FLOAT);
+            }
+
+            v4sf tmp1 = vec_mul(src_tmp1, scale_fact_vec);
+            v4sf tmp2 = vec_mul(src_tmp2, scale_fact_vec);
+            v4sf tmp3 = vec_mul(src_tmp3, scale_fact_vec);
+            v4sf tmp4 = vec_mul(src_tmp4, scale_fact_vec);
+            v4si tmp1_int = vec_cts(tmp1, 0);
+            v4si tmp2_int = vec_cts(tmp2, 0);
+            v4si tmp3_int = vec_cts(tmp3, 0);
+            v4si tmp4_int = vec_cts(tmp4, 0);
+            v8us tmp5 = vec_packsu(tmp1_int, tmp2_int);
+            v8us tmp6 = vec_packsu(tmp3_int, tmp4_int);
+
+            if (unalign_dst) {
+                vec_stu(*(v16u8 *) &tmp5, (unsigned char *) (dst + i));
+                vec_stu(*(v16u8 *) &tmp6, (unsigned char *) (dst + i + ALTIVEC_LEN_INT16));
+            } else {
+                vec_st(tmp5, 0, dst + i);
+                vec_st(tmp6, 0, dst + i + ALTIVEC_LEN_INT16);
+            }
+        }
    }

    if (rounding_mode == RndFinancial) {
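
The I16 and U16 variants share everything above except the pack step: vec_packs saturates each 32-bit lane into the signed 16-bit range, while vec_packsu saturates into the unsigned one. A per-lane scalar sketch of the difference (illustrative only, not code from this commit):

#include <stdint.h>

/* vec_packs lane behaviour: saturate int32 into [-32768, 32767] */
static int16_t pack_signed_sat(int32_t v)
{
    if (v < INT16_MIN) return INT16_MIN;
    if (v > INT16_MAX) return INT16_MAX;
    return (int16_t) v;
}

/* vec_packsu lane behaviour: saturate int32 into [0, 65535] */
static uint16_t pack_unsigned_sat(int32_t v)
{
    if (v < 0)          return 0;
    if (v > UINT16_MAX) return UINT16_MAX;
    return (uint16_t) v;
}
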
@@ -4051,25 +4155,24 @@ static inline void convertFloat32ToU16_128(float *src, uint16_t *dst, int len, int rounding_mode, int scale_factor)
    }
}

-/*
static inline void convertInt16ToFloat32_128(int16_t *src, float *dst, int len, int scale_factor)
{
    int stop_len = len / (2 * ALTIVEC_LEN_FLOAT);
    stop_len *= (2 * ALTIVEC_LEN_FLOAT);

    float scale_fact_mult = 1.0f / (float) (1 << scale_factor);
    v4sf scale_fact_vec = vec_splats(scale_fact_mult);
-
+    v4ui shift = vec_splats((unsigned int) 16);
+
    if (areAligned2((uintptr_t) (src), (uintptr_t) (dst), ALTIVEC_LEN_BYTES)) {
        for (int i = 0; i < stop_len; i += 2 * ALTIVEC_LEN_FLOAT) {
            v8ss vec = vec_ld(0, src + i);    // loads 1 2 3 4 5 6 7 8
            v8ss low = vec_mergeh(vec, vec);  // low 1 1 2 2 3 3 4 4
            v8ss high = vec_mergel(vec, vec); // high 5 5 6 6 7 7 8 8
-            v4ui shift = vec_splats((unsigned int) 16);
-            v16u8 lowu = vec_sra(*(v16u8 *) &low, *(v16u8 *) &shift);
-            v16u8 highu = vec_sra(*(v16u8 *) &high, *(v16u8 *) &shift);
-            v4sf lowf = vec_ctf(*(v4si *) &lowu, 0);
-            v4sf highf = vec_ctf(*(v4si *) &highu, 0);
+            v4si lows = vec_sra(*(v4si *) &low, shift);   // arithmetic >> 16 sign-extends each 16-bit value
+            v4si highs = vec_sra(*(v4si *) &high, shift); // same for the upper four samples
+            v4sf lowf = vec_ctf(lows, 0);
+            v4sf highf = vec_ctf(highs, 0);

            // convert the vector to float and scale it
            v4sf floatlo = vec_mul(lowf, scale_fact_vec);
@@ -4079,12 +4182,40 @@ static inline void convertInt16ToFloat32_128(int16_t *src, float *dst, int len, int scale_factor)
            vec_st(floathi, 0, dst + i + ALTIVEC_LEN_FLOAT);
        }
    } else {
-        //TODO
+        int unalign_src = (uintptr_t) (src) % ALTIVEC_LEN_BYTES;
+        int unalign_dst = (uintptr_t) (dst) % ALTIVEC_LEN_BYTES;
+
+        for (int i = 0; i < stop_len; i += 2 * ALTIVEC_LEN_FLOAT) {
+            v8ss vec;
+            if (unalign_src) {
+                vec = (v8ss) vec_ldu((unsigned char *) (src + i));
+            } else {
+                vec = vec_ld(0, src + i);
+            }
+
+            v8ss low = vec_mergeh(vec, vec);  // low 1 1 2 2 3 3 4 4
+            v8ss high = vec_mergel(vec, vec); // high 5 5 6 6 7 7 8 8
+            v4si lows = vec_sra(*(v4si *) &low, shift);   // arithmetic >> 16 sign-extends each 16-bit value
+            v4si highs = vec_sra(*(v4si *) &high, shift); // same for the upper four samples
+            v4sf lowf = vec_ctf(lows, 0);
+            v4sf highf = vec_ctf(highs, 0);
+
+            // convert the vector to float and scale it
+            v4sf floatlo = vec_mul(lowf, scale_fact_vec);
+            v4sf floathi = vec_mul(highf, scale_fact_vec);
+
+            if (unalign_dst) {
+                vec_stu(*(v16u8 *) &floatlo, (unsigned char *) (dst + i));
+                vec_stu(*(v16u8 *) &floathi, (unsigned char *) (dst + i + ALTIVEC_LEN_FLOAT));
+            } else {
+                vec_st(floatlo, 0, dst + i);
+                vec_st(floathi, 0, dst + i + ALTIVEC_LEN_FLOAT);
+            }
+        }
    }

    for (int i = stop_len; i < len; i++) {
        dst[i] = (float) src[i] * scale_fact_mult;
    }
}
-*/
#endif
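
The uncommented convertInt16ToFloat32_128 widens each 16-bit sample by duplicating it with vec_mergeh/vec_mergel and then arithmetic-shifting the resulting 32-bit lanes right by 16 with vec_sra, replacing the old v16u8 version whose byte-wise shift did not produce a 32-bit sign extension. A scalar model of the trick (illustrative only; it assumes classic AltiVec's big-endian lane layout and an arithmetic >> on signed ints, both of which hold on this target):

#include <stdint.h>

/* One 32-bit lane after vec_mergeh(vec, vec): the same sample sits in both
 * 16-bit halves of the word. An arithmetic shift right by 16 then leaves the
 * sample sign-extended to 32 bits, ready for vec_ctf. */
static int32_t sign_extend_merged(int16_t sample)
{
    uint32_t merged = ((uint32_t) (uint16_t) sample << 16) | (uint16_t) sample;
    return (int32_t) merged >> 16; /* arithmetic shift drags the sign bit down */
}
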