Skip to content

Commit f823830

Browse files
committed
Added atan2f_interleaved for SSE and AVX, and optimised some SSE interleaved functions
1 parent ebc3528 commit f823830

5 files changed

+298
-131
lines changed

simd_test.c

+55
Original file line numberDiff line numberDiff line change
@@ -5703,6 +5703,61 @@ for (int i = 0; i < len; i++){
57035703
#endif
57045704

57055705

5706+
printf("\n");
5707+
/////////////////////////////////////////////////////////// ATANF2_INTERLEAVED /////////////////////////////////////////////////////
5708+
printf("ATANF2_INTERLEAVED\n");
5709+
5710+
for (int i = 0; i < 2*len; i++) {
5711+
inout[i] = (float) (-1.0f * i + 0.15f) / 2.5f / (float) (5 * len);
5712+
inout_ref[i] = 50.0f;
5713+
inout2_ref[i] = 50.0f;
5714+
}
5715+
5716+
clock_gettime(CLOCK_REALTIME, &start);
5717+
atan2f_interleaved_C((complex32_t*)inout, inout_ref, len);
5718+
clock_gettime(CLOCK_REALTIME, &stop);
5719+
elapsed = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3;
5720+
printf("atan2f_interleaved_C %d %lf\n", len, elapsed);
5721+
5722+
clock_gettime(CLOCK_REALTIME, &start);
5723+
for (l = 0; l < loop; l++)
5724+
atan2f_interleaved_C((complex32_t*)inout, inout_ref, len);
5725+
clock_gettime(CLOCK_REALTIME, &stop);
5726+
elapsed = ((stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3) / (double) loop;
5727+
printf("atan2f_interleaved_C %d %lf\n", len, elapsed);
5728+
5729+
#ifdef SSE
5730+
clock_gettime(CLOCK_REALTIME, &start);
5731+
atan2128f_interleaved((complex32_t*)inout, inout2_ref, len);
5732+
clock_gettime(CLOCK_REALTIME, &stop);
5733+
elapsed = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3;
5734+
printf("atan2128f_interleaved %d %lf\n", len, elapsed);
5735+
5736+
clock_gettime(CLOCK_REALTIME, &start);
5737+
for (l = 0; l < loop; l++)
5738+
atan2128f_interleaved((complex32_t*)inout, inout2_ref, len);
5739+
clock_gettime(CLOCK_REALTIME, &stop);
5740+
elapsed = ((stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3) / (double) loop;
5741+
printf("atan2128f_interleaved %d %lf\n", len, elapsed);
5742+
l2_err(inout2_ref, inout_ref, len);
5743+
#endif
5744+
5745+
#ifdef AVX
5746+
clock_gettime(CLOCK_REALTIME, &start);
5747+
atan2256f_interleaved((complex32_t*)inout, inout2_ref, len);
5748+
clock_gettime(CLOCK_REALTIME, &stop);
5749+
elapsed = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3;
5750+
printf("atan2256f_interleaved %d %lf\n", len, elapsed);
5751+
5752+
clock_gettime(CLOCK_REALTIME, &start);
5753+
for (l = 0; l < loop; l++)
5754+
atan2256f_interleaved((complex32_t*)inout, inout2_ref, len);
5755+
clock_gettime(CLOCK_REALTIME, &stop);
5756+
elapsed = ((stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3) / (double) loop;
5757+
printf("atan2256f_interleaved %d %lf\n", len, elapsed);
5758+
l2_err(inout2_ref, inout_ref, len);
5759+
#endif
5760+
57065761
printf("\n");
57075762
/////////////////////////////////////////////////////////// ATAN //////////////////////////////////////////////////////////////////////////////
57085763
printf("ATAN\n");

simd_utils.h

+89
Original file line numberDiff line numberDiff line change
@@ -93,29 +93,43 @@ typedef enum {
9393
static inline int isAligned(uintptr_t ptr, size_t alignment)
9494
{
9595
#ifndef ALWAYS_ALIGNED
96+
97+
#ifndef ARM //ARM manages disalignment in hardware
9698
if (((uintptr_t)(ptr) % alignment) == 0)
9799
return 1;
98100
return 0;
99101
#else
100102
return 1;
101103
#endif
104+
105+
#else
106+
return 1;
107+
#endif
102108
}
103109

104110
static inline int areAligned2(uintptr_t ptr1, uintptr_t ptr2, size_t alignment)
105111
{
106112
#ifndef ALWAYS_ALIGNED
113+
114+
#ifndef ARM //ARM manages disalignment in hardware
107115
if (((uintptr_t)(ptr1) % alignment) == 0)
108116
if (((uintptr_t)(ptr2) % alignment) == 0)
109117
return 1;
110118
return 0;
111119
#else
112120
return 1;
113121
#endif
122+
123+
#else
124+
return 1;
125+
#endif
114126
}
115127

116128
static inline int areAligned3(uintptr_t ptr1, uintptr_t ptr2, uintptr_t ptr3, size_t alignment)
117129
{
118130
#ifndef ALWAYS_ALIGNED
131+
132+
#ifndef ARM //ARM manages disalignment in hardware
119133
if (((uintptr_t)(ptr1) % alignment) == 0)
120134
if (((uintptr_t)(ptr2) % alignment) == 0)
121135
if (((uintptr_t)(ptr3) % alignment) == 0)
@@ -124,6 +138,10 @@ static inline int areAligned3(uintptr_t ptr1, uintptr_t ptr2, uintptr_t ptr3, si
124138
#else
125139
return 1;
126140
#endif
141+
142+
#else
143+
return 1;
144+
#endif
127145
}
128146

129147

@@ -153,6 +171,66 @@ static inline void simd_utils_get_version(void)
153171

154172
#endif /* ARM */
155173

174+
#ifndef ARM
175+
typedef struct {
176+
v4sf val[2];
177+
} v4sfx2;
178+
#else
179+
typedef float32x4x2_t v4sfx2;
180+
#endif
181+
182+
static inline v4sfx2 _mm_load2_ps(float const *mem_addr)
183+
{
184+
#ifdef ARM
185+
return vld2q_f32(mem_addr);
186+
#else
187+
v4sf tmp1 = _mm_load_ps(mem_addr);
188+
v4sf tmp2 = _mm_load_ps(mem_addr + SSE_LEN_FLOAT);
189+
v4sfx2 ret;
190+
ret.val[0] = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(2, 0, 2, 0));
191+
ret.val[1] = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3, 1, 3, 1));
192+
return ret;
193+
#endif
194+
}
195+
196+
static inline v4sfx2 _mm_load2u_ps(float const *mem_addr)
197+
{
198+
#ifdef ARM
199+
return vld2q_f32(mem_addr);
200+
#else
201+
v4sf tmp1 = _mm_loadu_ps(mem_addr);
202+
v4sf tmp2 = _mm_loadu_ps(mem_addr + SSE_LEN_FLOAT);
203+
v4sfx2 ret;
204+
ret.val[0] = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(2, 0, 2, 0));
205+
ret.val[1] = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3, 1, 3, 1));
206+
return ret;
207+
#endif
208+
}
209+
210+
static inline void _mm_store2_ps(float *mem_addr, v4sfx2 a)
211+
{
212+
#ifdef ARM
213+
vst2q_f32(mem_addr, a);
214+
#else
215+
v4sf tmp1 = _mm_unpacklo_ps(a.val[0], a.val[1]);
216+
v4sf tmp2 = _mm_unpackhi_ps(a.val[0], a.val[1]);
217+
_mm_store_ps(mem_addr, tmp1);
218+
_mm_store_ps(mem_addr + SSE_LEN_FLOAT, tmp2);
219+
#endif
220+
}
221+
222+
static inline void _mm_store2u_ps(float *mem_addr, v4sfx2 a)
223+
{
224+
#ifdef ARM
225+
vst2q_f32(mem_addr, a);
226+
#else
227+
v4sf tmp1 = _mm_unpacklo_ps(a.val[0], a.val[1]);
228+
v4sf tmp2 = _mm_unpackhi_ps(a.val[0], a.val[1]);
229+
_mm_storeu_ps(mem_addr, tmp1);
230+
_mm_storeu_ps(mem_addr + SSE_LEN_FLOAT, tmp2);
231+
#endif
232+
}
233+
156234
//Warning, declared in reverse order since it's little endian :
157235
// const v4sf conj_mask = _mm_set_ps(-1.0f, 1.0f, -1.0f, 1.0f);
158236
static const float _ps_conj_mask[4] __attribute__((aligned(16))) = {1.0f, -1.0f, 1.0f, -1.0f};
@@ -1328,6 +1406,17 @@ static inline void atan2f_C(float *src1, float *src2, float *dst, int len)
13281406
}
13291407
}
13301408

1409+
static inline void atan2f_interleaved_C(complex32_t *src, float *dst, int len)
1410+
{
1411+
#ifdef OMP
1412+
#pragma omp simd
1413+
#endif
1414+
for (int i = 0; i < len; i++) {
1415+
dst[i] = atan2f(src[i].im, src[i].re);
1416+
}
1417+
}
1418+
1419+
13311420

13321421
static inline void sinf_C(float *src, float *dst, int len)
13331422
{

simd_utils_avx_double.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -563,7 +563,7 @@ static inline v4sd atan256_pd(v4sd xx)
563563
flag = _mm256_blendv_pd(flag, *(v4sd *) _pd256_1, suptan3pi8); // if( x > tan 3pi/8 ) then flag = 1
564564

565565
inftan3pi8inf0p66 = _mm256_and_pd(_mm256_cmp_pd(x, *(v4sd *) _pd256_TAN3PI8, _CMP_LE_OS), _mm256_cmp_pd(x, zerop66, _CMP_LE_OS)); // if( x <= tan 3pi/8 ) && (x <= 0.66)
566-
y = _mm256_blendv_pd(*(v4sd *) _pd_PIO4, y, inftan3pi8inf0p66); // y = 0 or PIO4
566+
y = _mm256_blendv_pd(*(v4sd *) _pd256_PIO4, y, inftan3pi8inf0p66); // y = 0 or PIO4
567567
x = _mm256_blendv_pd(_mm256_div_pd(_mm256_sub_pd(x, *(v4sd *) _pd256_1), _mm256_add_pd(x, *(v4sd *) _pd256_1)), x, inftan3pi8inf0p66);
568568
flag = _mm256_blendv_pd(flag, *(v4sd *) _pd256_2, _mm256_cmp_pd(*(v4sd *) _pd256_PIO4, y, _CMP_EQ_OS)); // if y = PIO4 then flag = 2
569569

simd_utils_avx_float.h

+47-1
Original file line numberDiff line numberDiff line change
@@ -627,7 +627,7 @@ static inline void print8(__m256 v)
627627
}
628628

629629
// converts 32bits complex float to two arrays real and im
630-
//Work in progress
630+
//Work in progress => could be improved with custom SSE mm_load2_ps
631631
static inline void cplxtoreal256f(float *src, float *dstRe, float *dstIm, int len)
632632
{
633633
int stop_len = 2 * len / (AVX_LEN_FLOAT);
@@ -1535,6 +1535,52 @@ static inline void atan2256f(float *src1, float *src2, float *dst, int len)
15351535
}
15361536
}
15371537

1538+
static inline void atan2256f_interleaved(complex32_t *src, float *dst, int len)
1539+
{
1540+
int stop_len = len / (2 * AVX_LEN_FLOAT);
1541+
stop_len *= 2 * AVX_LEN_FLOAT;
1542+
1543+
int j = 0;
1544+
if (areAligned2((uintptr_t)(src), (uintptr_t)(dst), AVX_LEN_BYTES)) {
1545+
for (int i = 0; i < stop_len; i += 2 * AVX_LEN_FLOAT) {
1546+
v4sfx2 src_1 = _mm_load2_ps((float *) (src) + j);
1547+
v4sfx2 src_2 = _mm_load2_ps((float *) (src) + j + 2 * SSE_LEN_FLOAT);
1548+
v4sfx2 src_3 = _mm_load2_ps((float *) (src) + j + 4 * SSE_LEN_FLOAT);
1549+
v4sfx2 src_4 = _mm_load2_ps((float *) (src) + j + 6 * SSE_LEN_FLOAT);
1550+
1551+
v8sf src_a_re = _mm256_set_m128(src_2.val[0], src_1.val[0]);
1552+
v8sf src_a_im = _mm256_set_m128(src_2.val[1], src_1.val[1]);
1553+
v8sf src_b_re = _mm256_set_m128(src_4.val[0], src_3.val[0]);
1554+
v8sf src_b_im = _mm256_set_m128(src_4.val[1], src_3.val[1]);
1555+
_mm256_store_ps(dst + i, atan2256f_ps(src_a_im, src_a_re));
1556+
_mm256_store_ps(dst + i + AVX_LEN_FLOAT, atan2256f_ps(src_b_im, src_b_re));
1557+
1558+
j += 4 * AVX_LEN_FLOAT;
1559+
}
1560+
} else {
1561+
for (int i = 0; i < stop_len; i += 2 * AVX_LEN_FLOAT) {
1562+
v4sfx2 src_1 = _mm_load2u_ps((float *) (src) + j);
1563+
v4sfx2 src_2 = _mm_load2u_ps((float *) (src) + j + 2 * SSE_LEN_FLOAT);
1564+
v4sfx2 src_3 = _mm_load2u_ps((float *) (src) + j + 4 * SSE_LEN_FLOAT);
1565+
v4sfx2 src_4 = _mm_load2u_ps((float *) (src) + j + 6 * SSE_LEN_FLOAT);
1566+
1567+
v8sf src_a_re = _mm256_set_m128(src_2.val[0], src_1.val[0]);
1568+
v8sf src_a_im = _mm256_set_m128(src_2.val[1], src_1.val[1]);
1569+
v8sf src_b_re = _mm256_set_m128(src_4.val[0], src_3.val[0]);
1570+
v8sf src_b_im = _mm256_set_m128(src_4.val[1], src_3.val[1]);
1571+
_mm256_storeu_ps(dst + i, atan2256f_ps(src_a_im, src_a_re));
1572+
_mm256_storeu_ps(dst + i + AVX_LEN_FLOAT, atan2256f_ps(src_b_im, src_b_re));
1573+
1574+
j += 4 * AVX_LEN_FLOAT;
1575+
}
1576+
}
1577+
1578+
for (int i = stop_len; i < len; i++) {
1579+
dst[i] = atan2f(src[i].im, src[i].re);
1580+
}
1581+
}
1582+
1583+
15381584
static inline v8sf asin256f_ps(v8sf xx)
15391585
{
15401586
v8sf a, x, z, z_tmp;

0 commit comments

Comments
 (0)