Skip to content

Commit 10766f1

Browse files
committed
completed AVX512 port! Upgraded to version 0.2.2
1 parent 554f76f commit 10766f1

21 files changed

+1877
-273
lines changed

README.md

+108-108
Large diffs are not rendered by default.

avx512_mathfun.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Project : SIMD_Utils
3-
* Version : 0.2.1
3+
* Version : 0.2.2
44
* Author : JishinMaster
55
* Licence : BSD-2
66
*/

mysincosf.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Project : SIMD_Utils
3-
* Version : 0.2.1
3+
* Version : 0.2.2
44
* Author : JishinMaster
55
* Licence : BSD-2
66
*/

simd_test.c

+293-6
Large diffs are not rendered by default.

simd_test_opencl.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Project : SIMD_Utils
3-
* Version : 0.2.1
3+
* Version : 0.2.2
44
* Author : JishinMaster
55
* Licence : BSD-2
66
*/

simd_utils.h

+100-28
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Project : SIMD_Utils
3-
* Version : 0.2.1
3+
* Version : 0.2.2
44
* Author : JishinMaster
55
* Licence : BSD-2
66
*/
@@ -92,44 +92,42 @@ static inline void simd_utils_get_version(void)
9292

9393
#ifdef SSE
9494

95-
// For X86 devices with only SSE2
9695
#ifdef NO_SSE3
97-
/* Fallback for builds compiled with NO_SSE3: emulates the SSE3
 * _mm_movehdup_ps intrinsic with a plain SSE shuffle.
 * Returns (x1, x1, x3, x3), i.e. each odd-indexed lane duplicated into the
 * even-indexed lane below it. */
static inline __m128 _mm_movehdup_ps(__m128 __X)
{
    /* _MM_SHUFFLE(3, 3, 1, 1) expands to the immediate 0xF5 */
    return _mm_shuffle_ps(__X, __X, _MM_SHUFFLE(3, 3, 1, 1));
}
101100

102-
/* Fallback for builds compiled with NO_SSE3: emulates the SSE3
 * _mm_moveldup_ps intrinsic with a plain SSE shuffle.
 * Returns (x0, x0, x2, x2), i.e. each even-indexed lane duplicated into the
 * odd-indexed lane above it. */
static inline __m128 _mm_moveldup_ps(__m128 __X)
{
    /* _MM_SHUFFLE(2, 2, 0, 0) expands to the immediate 0xA0 */
    return _mm_shuffle_ps(__X, __X, _MM_SHUFFLE(2, 2, 0, 0));
}
106105
#endif
107106

108-
// For X86 devices with only SSE2 and SSE3 (before CoreI7)
109107
#ifdef NO_SSE4
110108
/* Fallback for builds compiled with NO_SSE4: emulates the SSE4.1
 * _mm_cmpeq_epi64 intrinsic using SSE2 only.
 * Each 64-bit lane of the result is all ones when the corresponding lanes of
 * __X and __Y are equal, and zero otherwise. */
static inline __m128i _mm_cmpeq_epi64(__m128i __X, __m128i __Y)
{
    /* Compare the four 32-bit halves independently. */
    __m128i eq32 = _mm_cmpeq_epi32(__X, __Y);
    /* Swap the two halves inside each 64-bit lane ... */
    __m128i eq32_swapped = _mm_shuffle_epi32(eq32, _MM_SHUFFLE(2, 3, 0, 1));
    /* ... so that a 64-bit lane is set only when both of its halves matched. */
    return _mm_and_si128(eq32, eq32_swapped);
}
121119

122120
/* Fallback for builds compiled with NO_SSE4: emulates the SSE4.1
 * _mm_blendv_pd intrinsic using SSE2 only.
 * For each 64-bit lane, returns the lane of __Y when the sign bit of the
 * corresponding lane of __M is set, and the lane of __X otherwise.
 * Masks made of all-ones / all-zero lanes give the same result as the
 * previous compare-with-zero implementation. */
static inline __m128d _mm_blendv_pd(__m128d __X, __m128d __Y, __m128d __M)
{
    /* Copy the upper 32 bits (which hold the sign bit) over each 64-bit lane,
     * then arithmetic-shift to smear the sign bit across the whole lane. */
    __m128i hi_words = _mm_shuffle_epi32(_mm_castpd_si128(__M), _MM_SHUFFLE(3, 3, 1, 1));
    __m128d select_y = _mm_castsi128_pd(_mm_srai_epi32(hi_words, 31));

    return _mm_or_pd(_mm_andnot_pd(select_y, __X), _mm_and_pd(select_y, __Y));
}
128126

129127
/* Fallback for builds compiled with NO_SSE4: emulates the SSE4.1
 * _mm_blendv_ps intrinsic using SSE2 only.
 * For each 32-bit lane, returns the lane of __Y when the sign bit of the
 * corresponding lane of __M is set, and the lane of __X otherwise.
 * Masks made of all-ones / all-zero lanes give the same result as the
 * previous compare-with-zero implementation. */
static inline __m128 _mm_blendv_ps(__m128 __X, __m128 __Y, __m128 __M)
{
    /* Arithmetic shift smears the sign bit across the whole 32-bit lane. */
    __m128 select_y = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(__M), 31));

    return _mm_or_ps(_mm_andnot_ps(select_y, __X), _mm_and_ps(select_y, __Y));
}
135133

@@ -140,34 +138,41 @@ static inline __m128i _mm_stream_load_si128(__m128i *__X)
140138

141139
static inline __m128 _mm_round_ps(__m128 X, int mode)
142140
{
143-
__m128 ret;
144-
__m128i reti;
145-
unsigned int old_mode = _MM_GET_ROUNDING_MODE();
146-
switch(mode){
141+
__m128 ret;
142+
__m128i reti;
143+
unsigned int old_mode = _MM_GET_ROUNDING_MODE();
144+
switch (mode) {
147145
case _MM_FROUND_TRUNC:
148146
case _MM_ROUND_TOWARD_ZERO:
149147
case ROUNDTOZERO:
150148
_MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
151-
break;
149+
break;
152150
case ROUNDTOCEIL:
153151
case _MM_ROUND_UP:
154152
_MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
155-
break;
153+
break;
156154
case ROUNDTOFLOOR:
157155
case _MM_ROUND_DOWN:
158156
_MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
159157
break;
160158
default:
161-
_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
159+
//_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
162160
break;
161+
}
162+
reti = _mm_cvtps_epi32(X);
163+
ret = _mm_cvtepi32_ps(reti);
164+
_MM_SET_ROUNDING_MODE(old_mode);
165+
return ret;
163166
}
164-
reti = _mm_cvtps_epi32(X);
165-
ret = _mm_cvtepi32_ps(reti);
166-
_MM_SET_ROUNDING_MODE(old_mode);
167-
return ret;
167+
168+
/* Fallback for builds compiled with NO_SSE4: emulates the SSE4.1
 * _mm_packus_epi32 intrinsic using SSE2 only.
 * Packs the eight signed 32-bit lanes of a (low half of the result) and
 * b (high half) into unsigned 16-bit lanes with unsigned saturation:
 * negative inputs become 0 and inputs above 65535 become 65535. */
static inline __m128i _mm_packus_epi32(__m128i a, __m128i b)
{
    const __m128i u16_max = _mm_set1_epi32(0xFFFF);
    const __m128i bias = _mm_set1_epi32(0x8000);

    /* Clamp to [0, 65535]: zero the negative lanes, then force lanes above
     * 65535 to all ones and keep only their low 16 bits. */
    a = _mm_andnot_si128(_mm_srai_epi32(a, 31), a);
    a = _mm_and_si128(_mm_or_si128(a, _mm_cmpgt_epi32(a, u16_max)), u16_max);
    b = _mm_andnot_si128(_mm_srai_epi32(b, 31), b);
    b = _mm_and_si128(_mm_or_si128(b, _mm_cmpgt_epi32(b, u16_max)), u16_max);

    /* Shift into the signed 16-bit range so the signed pack is exact, then
     * flip the top bit of every 16-bit lane to undo the bias. */
    __m128i packed = _mm_packs_epi32(_mm_sub_epi32(a, bias), _mm_sub_epi32(b, bias));
    return _mm_xor_si128(packed, _mm_set1_epi16((short) -32768));
}
169174
#endif
170-
175+
171176
#ifndef ARM
172177
#include "sse_mathfun.h"
173178
#else /* ARM */
@@ -410,6 +415,42 @@ static inline __m512d _mm512_fnmadd_pd_custom(__m512d a, __m512d b, __m512d c)
410415

411416
#include "avx512_mathfun.h"
412417

418+
static inline v16sfx2 _mm512_load2_ps(float const *mem_addr)
419+
{
420+
v16sf vec1 = _mm512_load_ps(mem_addr); // load 0 1 2 3 4 5 6 7
421+
v16sf vec2 = _mm512_load_ps(mem_addr + AVX512_LEN_FLOAT); // load 8 9 10 11 12 13 14 15
422+
v16sfx2 ret;
423+
ret.val[0] = _mm512_permutex2var_ps(vec2, *(v16si *) _pi32_512_idx_re, vec1);
424+
ret.val[1] = _mm512_permutex2var_ps(vec2, *(v16si *) _pi32_512_idx_im, vec1);
425+
return ret;
426+
}
427+
428+
static inline v16sfx2 _mm512_load2u_ps(float const *mem_addr)
429+
{
430+
v16sf vec1 = _mm512_loadu_ps(mem_addr); // load 0 1 2 3 4 5 6 7
431+
v16sf vec2 = _mm512_loadu_ps(mem_addr + AVX512_LEN_FLOAT); // load 8 9 10 11 12 13 14 15
432+
v16sfx2 ret;
433+
ret.val[0] = _mm512_permutex2var_ps(vec2, *(v16si *) _pi32_512_idx_re, vec1);
434+
ret.val[1] = _mm512_permutex2var_ps(vec2, *(v16si *) _pi32_512_idx_im, vec1);
435+
return ret;
436+
}
437+
438+
/* Combines the two vectors of a through the _pi32_512_idx_cplx_lo and
 * _pi32_512_idx_cplx_hi permutation tables and writes the two resulting
 * vectors with aligned stores to mem_addr and mem_addr + AVX512_LEN_FLOAT.
 * NOTE(review): presumably the inverse of _mm512_load2_ps (re-interleaving
 * val[0] and val[1]); confirm against the table definitions. */
static inline void _mm512_store2_ps(float *mem_addr, v16sfx2 a)
{
    v16sf low_part = _mm512_permutex2var_ps(a.val[1], *(v16si *) _pi32_512_idx_cplx_lo, a.val[0]);
    v16sf high_part = _mm512_permutex2var_ps(a.val[1], *(v16si *) _pi32_512_idx_cplx_hi, a.val[0]);

    _mm512_store_ps(mem_addr, low_part);
    _mm512_store_ps(mem_addr + AVX512_LEN_FLOAT, high_part);
}
445+
446+
/* Unaligned-store variant of _mm512_store2_ps: combines the two vectors of a
 * through the _pi32_512_idx_cplx_lo and _pi32_512_idx_cplx_hi permutation
 * tables and writes the two resulting vectors to mem_addr and
 * mem_addr + AVX512_LEN_FLOAT.
 * NOTE(review): presumably the inverse of _mm512_load2u_ps (re-interleaving
 * val[0] and val[1]); confirm against the table definitions. */
static inline void _mm512_store2u_ps(float *mem_addr, v16sfx2 a)
{
    v16sf low_part = _mm512_permutex2var_ps(a.val[1], *(v16si *) _pi32_512_idx_cplx_lo, a.val[0]);
    v16sf high_part = _mm512_permutex2var_ps(a.val[1], *(v16si *) _pi32_512_idx_cplx_hi, a.val[0]);

    _mm512_storeu_ps(mem_addr, low_part);
    _mm512_storeu_ps(mem_addr + AVX512_LEN_FLOAT, high_part);
}
453+
413454
#include "simd_utils_avx512_double.h"
414455
#include "simd_utils_avx512_float.h"
415456
#include "simd_utils_avx512_int32.h"
@@ -748,7 +789,38 @@ static inline void convertFloat32ToI16_C(float *src, int16_t *dst, int len, int
748789
#endif
749790
for (int i = 0; i < len; i++) {
750791
float tmp = nearbyintf(src[i] * scale_fact_mult);
751-
dst[i] = (uint16_t) (tmp > 32767.0f ? 32767.0f : tmp);
792+
dst[i] = (int16_t) (tmp > 32767.0f ? 32767.0f : tmp);
793+
}
794+
}
795+
}
796+
797+
static inline void convertFloat32ToU16_C(float *src, uint16_t *dst, int len, int rounding_mode, int scale_factor)
798+
{
799+
float scale_fact_mult = 1.0f / (float) (1 << scale_factor);
800+
801+
// Default bankers rounding => round to nearest even
802+
if (rounding_mode == RndFinancial) {
803+
#ifdef OMP
804+
#pragma omp simd
805+
#endif
806+
for (int i = 0; i < len; i++) {
807+
float tmp = (roundf(src[i] * scale_fact_mult * 0.5f) / 2.0f);
808+
dst[i] = (uint16_t) (tmp > 65535.0f ? 65535.0f : tmp); // round to nearest even with round(x/2)*2
809+
}
810+
} else {
811+
if (rounding_mode == RndZero) {
812+
fesetround(FE_TOWARDZERO);
813+
} else {
814+
fesetround(FE_TONEAREST);
815+
}
816+
817+
// Default round toward zero
818+
#ifdef OMP
819+
#pragma omp simd
820+
#endif
821+
for (int i = 0; i < len; i++) {
822+
float tmp = nearbyintf(src[i] * scale_fact_mult);
823+
dst[i] = (uint16_t) (tmp > 65535.0f ? 65535.0f : tmp);
752824
}
753825
}
754826
}

simd_utils_altivec_float.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Project : SIMD_Utils
3-
* Version : 0.2.1
3+
* Version : 0.2.2
44
* Author : JishinMaster
55
* Licence : BSD-2
66
*/

simd_utils_avx512_double.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Project : SIMD_Utils
3-
* Version : 0.2.1
3+
* Version : 0.2.2
44
* Author : JishinMaster
55
* Licence : BSD-2
66
*/

0 commit comments

Comments
 (0)