/*
 * Project : SIMD_Utils
 * Version : 0.2.2
 * Author  : JishinMaster
 * Licence : BSD-2
 */
@@ -92,44 +92,42 @@ static inline void simd_utils_get_version(void)
92
92
93
93
#ifdef SSE
94
94
95
- // For X86 devices with only SSE2
96
95
#ifdef NO_SSE3
97
/* SSE2 fallback for the SSE3 intrinsic: duplicate the odd-index (high)
   element of each 64-bit pair, i.e. {x1, x1, x3, x3}. */
static inline __m128 _mm_movehdup_ps(__m128 __X)
{
    return _mm_shuffle_ps(__X, __X, _MM_SHUFFLE(3, 3, 1, 1)); /* == 0xF5 */
}
101
100
102
/* SSE2 fallback for the SSE3 intrinsic: duplicate the even-index (low)
   element of each 64-bit pair, i.e. {x0, x0, x2, x2}. */
static inline __m128 _mm_moveldup_ps(__m128 __X)
{
    return _mm_shuffle_ps(__X, __X, _MM_SHUFFLE(2, 2, 0, 0)); /* == 0xA0 */
}
106
105
#endif
107
106
108
- // For X86 devices with only SSE2 and SSE3 (before CoreI7)
109
107
#ifdef NO_SSE4
110
108
/* SSE2 fallback for the SSE4.1 intrinsic: per-64-bit-lane equality compare,
   producing all-ones (0xFFFF...FF) for equal lanes and zero otherwise.
   Uses memcpy instead of int64_t* casts into the vector objects so the
   lane access does not rely on strict-aliasing-violating pointer punning. */
static inline __m128i _mm_cmpeq_epi64(__m128i __X, __m128i __Y)
{
    int64_t x[2], y[2], r[2];
    __m128i ret;
    memcpy(x, &__X, sizeof(x));
    memcpy(y, &__Y, sizeof(y));
    r[0] = (x[0] == y[0]) ? -1 : 0; /* -1 == all bits set in two's complement */
    r[1] = (x[1] == y[1]) ? -1 : 0;
    memcpy(&ret, r, sizeof(ret));
    return ret;
}
121
119
122
120
static inline __m128d _mm_blendv_pd (__m128d __X , __m128d __Y , __m128d __M )
123
121
{
124
122
__m128d b_tmp = _mm_and_pd (__Y , __M );
125
- __m128d a_tmp = _mm_and_pd (__X , _mm_cmpeq_pd (__M ,* (__m128d * )_pd_zero ));
123
+ __m128d a_tmp = _mm_and_pd (__X , _mm_cmpeq_pd (__M , * (__m128d * ) _pd_zero ));
126
124
return _mm_or_pd (a_tmp , b_tmp );
127
125
}
128
126
129
127
static inline __m128 _mm_blendv_ps (__m128 __X , __m128 __Y , __m128 __M )
130
128
{
131
129
__m128 b_tmp = _mm_and_ps (__Y , __M );
132
- __m128 a_tmp = _mm_and_ps (__X , _mm_cmpeq_ps (__M ,* (__m128 * )_ps_zero ));
130
+ __m128 a_tmp = _mm_and_ps (__X , _mm_cmpeq_ps (__M , * (__m128 * ) _ps_zero ));
133
131
return _mm_or_ps (a_tmp , b_tmp );
134
132
}
135
133
@@ -140,34 +138,41 @@ static inline __m128i _mm_stream_load_si128(__m128i *__X)
140
138
141
139
static inline __m128 _mm_round_ps (__m128 X , int mode )
142
140
{
143
- __m128 ret ;
144
- __m128i reti ;
145
- unsigned int old_mode = _MM_GET_ROUNDING_MODE ();
146
- switch (mode ){
141
+ __m128 ret ;
142
+ __m128i reti ;
143
+ unsigned int old_mode = _MM_GET_ROUNDING_MODE ();
144
+ switch (mode ) {
147
145
case _MM_FROUND_TRUNC :
148
146
case _MM_ROUND_TOWARD_ZERO :
149
147
case ROUNDTOZERO :
150
148
_MM_SET_ROUNDING_MODE (_MM_ROUND_TOWARD_ZERO );
151
- break ;
149
+ break ;
152
150
case ROUNDTOCEIL :
153
151
case _MM_ROUND_UP :
154
152
_MM_SET_ROUNDING_MODE (_MM_ROUND_UP );
155
- break ;
153
+ break ;
156
154
case ROUNDTOFLOOR :
157
155
case _MM_ROUND_DOWN :
158
156
_MM_SET_ROUNDING_MODE (_MM_ROUND_DOWN );
159
157
break ;
160
158
default :
161
- _MM_SET_ROUNDING_MODE (_MM_ROUND_NEAREST );
159
+ // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
162
160
break ;
161
+ }
162
+ reti = _mm_cvtps_epi32 (X );
163
+ ret = _mm_cvtepi32_ps (reti );
164
+ _MM_SET_ROUNDING_MODE (old_mode );
165
+ return ret ;
163
166
}
164
- reti = _mm_cvtps_epi32 (X );
165
- ret = _mm_cvtepi32_ps (reti );
166
- _MM_SET_ROUNDING_MODE (old_mode );
167
- return ret ;
167
+
168
/* SSE2 fallback for the SSE4.1 intrinsic: pack two vectors of int32 into one
   vector of uint16 with unsigned saturation to [0, 65535].
   Implemented with the classic bias trick: shift the range down by 0x8000,
   use the signed saturating pack, then shift back up (16-bit add wraps).
   This is exact for every input >= INT32_MIN + 0x8000; only inputs below
   that (which should saturate to 0) wrap incorrectly — a corner the old
   plain _mm_packs_epi32 version got wrong for the whole [32768, 65535]
   range as well. */
static inline __m128i _mm_packus_epi32(__m128i a, __m128i b)
{
    const __m128i bias32 = _mm_set1_epi32(0x8000);
    const __m128i bias16 = _mm_set1_epi16((short) 0x8000);
    __m128i lo = _mm_sub_epi32(a, bias32);
    __m128i hi = _mm_sub_epi32(b, bias32);
    __m128i packed = _mm_packs_epi32(lo, hi);
    return _mm_add_epi16(packed, bias16);
}
169
174
#endif
170
-
175
+
171
176
#ifndef ARM
172
177
#include "sse_mathfun.h"
173
178
#else /* ARM */
@@ -410,6 +415,42 @@ static inline __m512d _mm512_fnmadd_pd_custom(__m512d a, __m512d b, __m512d c)
410
415
411
416
#include "avx512_mathfun.h"
412
417
418
// Aligned load of 32 contiguous floats, split into two deinterleaved vectors.
// The idx_re/idx_im names suggest mem_addr holds interleaved complex pairs and
// val[0]/val[1] receive the real/imaginary parts — TODO confirm against the
// _pi32_512_idx_re/_pi32_512_idx_im index tables.
static inline v16sfx2 _mm512_load2_ps(float const *mem_addr)
{
    v16sf vec1 = _mm512_load_ps(mem_addr);                     // elements 0..15
    v16sf vec2 = _mm512_load_ps(mem_addr + AVX512_LEN_FLOAT);  // elements 16..31
    v16sfx2 ret;
    ret.val[0] = _mm512_permutex2var_ps(vec2, *(v16si *) _pi32_512_idx_re, vec1);
    ret.val[1] = _mm512_permutex2var_ps(vec2, *(v16si *) _pi32_512_idx_im, vec1);
    return ret;
}
427
+
428
// Unaligned variant of _mm512_load2_ps: load 32 contiguous floats and split
// them into two deinterleaved vectors. The idx_re/idx_im names suggest
// complex-pair deinterleaving into real/imag parts — TODO confirm against
// the _pi32_512_idx_re/_pi32_512_idx_im index tables.
static inline v16sfx2 _mm512_load2u_ps(float const *mem_addr)
{
    v16sf vec1 = _mm512_loadu_ps(mem_addr);                     // elements 0..15
    v16sf vec2 = _mm512_loadu_ps(mem_addr + AVX512_LEN_FLOAT);  // elements 16..31
    v16sfx2 ret;
    ret.val[0] = _mm512_permutex2var_ps(vec2, *(v16si *) _pi32_512_idx_re, vec1);
    ret.val[1] = _mm512_permutex2var_ps(vec2, *(v16si *) _pi32_512_idx_im, vec1);
    return ret;
}
437
+
438
// Aligned store: merge the two vectors of a back into one 32-float sequence.
// The idx_cplx_lo/idx_cplx_hi names suggest val[0]/val[1] are re-interleaved
// into complex-pair order before storing — TODO confirm against the
// _pi32_512_idx_cplx_lo/_pi32_512_idx_cplx_hi index tables.
static inline void _mm512_store2_ps(float *mem_addr, v16sfx2 a)
{
    v16sf tmp1 = _mm512_permutex2var_ps(a.val[1], *(v16si *) _pi32_512_idx_cplx_lo, a.val[0]);
    v16sf tmp2 = _mm512_permutex2var_ps(a.val[1], *(v16si *) _pi32_512_idx_cplx_hi, a.val[0]);
    _mm512_store_ps(mem_addr, tmp1);                     // elements 0..15
    _mm512_store_ps(mem_addr + AVX512_LEN_FLOAT, tmp2);  // elements 16..31
}
445
+
446
// Unaligned variant of _mm512_store2_ps: merge the two vectors of a back into
// one 32-float sequence. The idx_cplx_lo/idx_cplx_hi names suggest
// re-interleaving into complex-pair order — TODO confirm against the
// _pi32_512_idx_cplx_lo/_pi32_512_idx_cplx_hi index tables.
static inline void _mm512_store2u_ps(float *mem_addr, v16sfx2 a)
{
    v16sf tmp1 = _mm512_permutex2var_ps(a.val[1], *(v16si *) _pi32_512_idx_cplx_lo, a.val[0]);
    v16sf tmp2 = _mm512_permutex2var_ps(a.val[1], *(v16si *) _pi32_512_idx_cplx_hi, a.val[0]);
    _mm512_storeu_ps(mem_addr, tmp1);                     // elements 0..15
    _mm512_storeu_ps(mem_addr + AVX512_LEN_FLOAT, tmp2);  // elements 16..31
}
453
+
413
454
#include "simd_utils_avx512_double.h"
414
455
#include "simd_utils_avx512_float.h"
415
456
#include "simd_utils_avx512_int32.h"
@@ -748,7 +789,38 @@ static inline void convertFloat32ToI16_C(float *src, int16_t *dst, int len, int
748
789
#endif
749
790
for (int i = 0 ; i < len ; i ++ ) {
750
791
float tmp = nearbyintf (src [i ] * scale_fact_mult );
751
- dst [i ] = (uint16_t ) (tmp > 32767.0f ? 32767.0f : tmp );
792
+ dst [i ] = (int16_t ) (tmp > 32767.0f ? 32767.0f : tmp );
793
+ }
794
+ }
795
+ }
796
+
797
+ static inline void convertFloat32ToU16_C (float * src , uint16_t * dst , int len , int rounding_mode , int scale_factor )
798
+ {
799
+ float scale_fact_mult = 1.0f / (float ) (1 << scale_factor );
800
+
801
+ // Default bankers rounding => round to nearest even
802
+ if (rounding_mode == RndFinancial ) {
803
+ #ifdef OMP
804
+ #pragma omp simd
805
+ #endif
806
+ for (int i = 0 ; i < len ; i ++ ) {
807
+ float tmp = (roundf (src [i ] * scale_fact_mult * 0.5f ) / 2.0f );
808
+ dst [i ] = (uint16_t ) (tmp > 65535.0f ? 65535.0f : tmp ); // round to nearest even with round(x/2)*2
809
+ }
810
+ } else {
811
+ if (rounding_mode == RndZero ) {
812
+ fesetround (FE_TOWARDZERO );
813
+ } else {
814
+ fesetround (FE_TONEAREST );
815
+ }
816
+
817
+ // Default round toward zero
818
+ #ifdef OMP
819
+ #pragma omp simd
820
+ #endif
821
+ for (int i = 0 ; i < len ; i ++ ) {
822
+ float tmp = nearbyintf (src [i ] * scale_fact_mult );
823
+ dst [i ] = (uint16_t ) (tmp > 65535.0f ? 65535.0f : tmp );
752
824
}
753
825
}
754
826
}
0 commit comments