Skip to content

Commit 21ef042

Browse files
committed
improved minmax and minor corrections
1 parent f823830 commit 21ef042

13 files changed

+466
-348
lines changed

README.md

+4-2
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ The following table is a work in progress, "X" means there is not yet an impleme
121121
| atanh128f | atanh256f | atanh512f | atanhf_C | ippsAtanh_32f_A24 | X | X |
122122
| atan128f | atan256f | atan512f | atanf_C | ippsAtan_32f_A24 | X | X |
123123
| atan2128f | atan2256f | atan2512f | atan2f_C | ippsAtan2_32f_A24 | X | X |
124+
| atan2128f_interleaved | atan2256f_interleaved | X | atan2f_interleaved_C | X | X | X |
124125
| asin128f | asin256f | asin512f | asinf_C | ippsAsin_32f_A24 | X | X |
125126
| tanh128f | tanh256f | tanh512f | tanhf_C | ippsTanh_32f_A24 | X | X |
126127
| tan128f | tan256f | tan512f | tanf_C | ippsTan_32f_A24 | X | X |
@@ -140,6 +141,7 @@ The following table is a work in progress, "X" means there is not yet an impleme
140141
| cplxconjvecmul128f | cplxconjvecmul256f | cplxconjvecmul512f | cplxconjvecmul_C | ippsMulByConj_32fc_A24 | X | X |
141142
| cplxconjvecmul128f_split | cplxconjvecmul256f_split | cplxconjvecmul512f_split | cplxconjvecmul_C_split | X | X | X |
142143
| cplxconj128f | cplxconj256f | cplxconj512f | cplxconj_C | ippsConj_32fc_A24 | X | X |
144+
| cplxvecdiv128f | cplxvecdiv256f | cplxvecdiv512f | cplxvecdiv_C | X | X | X |
143145
| set128d | set256d | set512d | setd_C | ippsSet_64f | X | X |
144146
| zero128d | zero256d | zero512d | zerod_C | ippsZero_64f | X | X |
145147
| copy128d | copy256d | copy512d | copyd_C | ippsCopy_64f | X | X |
@@ -168,8 +170,8 @@ The following table is a work in progress, "X" means there is not yet an impleme
168170
| addc128s | addc256s | addc512s | addcs_C | X | addcs_vec | X |
169171
| vectorSlope128s | X | X | vectorSlopes_C | ippsVectorSlope_32s | X | X |
170172
| copy128s | copy256s | copy512s | copys_C | ippsCopy_32s | X | X |
171-
| absdiff16s_128s | vectorSlope256s | X | X | X | mulcs_vec | X |
172-
| powerspect16s_128s_interleaved | powerspect16s_256s_interleaved | X | ors_c | ippsOr_32u | X | X |
173+
| absdiff16s_128s | absdiff16s_256s | X | X | X | mulcs_vec | X |
174+
| X | X | X | ors_c | ippsOr_32u | X | X |
173175
| X | X | X | ands_c | ippsAnd_32u | X | X |
174176
| sigmoid128f | sigmoid256f | X | sigmoidf_C | X | X | X |
175177
| PRelu128f | PRelu256f | X | PReluf_C | X | X | PRelu128f |

simd_test.c

+23-7
Original file line numberDiff line numberDiff line change
@@ -5707,35 +5707,35 @@ for (int i = 0; i < len; i++){
57075707
/////////////////////////////////////////////////////////// ATANF2_INTERLEAVED /////////////////////////////////////////////////////
57085708
printf("ATANF2_INTERLEAVED\n");
57095709

5710-
for (int i = 0; i < 2*len; i++) {
5710+
for (int i = 0; i < 2 * len; i++) {
57115711
inout[i] = (float) (-1.0f * i + 0.15f) / 2.5f / (float) (5 * len);
57125712
inout_ref[i] = 50.0f;
57135713
inout2_ref[i] = 50.0f;
57145714
}
57155715

57165716
clock_gettime(CLOCK_REALTIME, &start);
5717-
atan2f_interleaved_C((complex32_t*)inout, inout_ref, len);
5717+
atan2f_interleaved_C((complex32_t *) inout, inout_ref, len);
57185718
clock_gettime(CLOCK_REALTIME, &stop);
57195719
elapsed = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3;
57205720
printf("atan2f_interleaved_C %d %lf\n", len, elapsed);
57215721

57225722
clock_gettime(CLOCK_REALTIME, &start);
57235723
for (l = 0; l < loop; l++)
5724-
atan2f_interleaved_C((complex32_t*)inout, inout_ref, len);
5724+
atan2f_interleaved_C((complex32_t *) inout, inout_ref, len);
57255725
clock_gettime(CLOCK_REALTIME, &stop);
57265726
elapsed = ((stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3) / (double) loop;
57275727
printf("atan2f_interleaved_C %d %lf\n", len, elapsed);
57285728

57295729
#ifdef SSE
57305730
clock_gettime(CLOCK_REALTIME, &start);
5731-
atan2128f_interleaved((complex32_t*)inout, inout2_ref, len);
5731+
atan2128f_interleaved((complex32_t *) inout, inout2_ref, len);
57325732
clock_gettime(CLOCK_REALTIME, &stop);
57335733
elapsed = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3;
57345734
printf("atan2128f_interleaved %d %lf\n", len, elapsed);
57355735

57365736
clock_gettime(CLOCK_REALTIME, &start);
57375737
for (l = 0; l < loop; l++)
5738-
atan2128f_interleaved((complex32_t*)inout, inout2_ref, len);
5738+
atan2128f_interleaved((complex32_t *) inout, inout2_ref, len);
57395739
clock_gettime(CLOCK_REALTIME, &stop);
57405740
elapsed = ((stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3) / (double) loop;
57415741
printf("atan2128f_interleaved %d %lf\n", len, elapsed);
@@ -5744,14 +5744,14 @@ for (int i = 0; i < len; i++){
57445744

57455745
#ifdef AVX
57465746
clock_gettime(CLOCK_REALTIME, &start);
5747-
atan2256f_interleaved((complex32_t*)inout, inout2_ref, len);
5747+
atan2256f_interleaved((complex32_t *) inout, inout2_ref, len);
57485748
clock_gettime(CLOCK_REALTIME, &stop);
57495749
elapsed = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3;
57505750
printf("atan2256f_interleaved %d %lf\n", len, elapsed);
57515751

57525752
clock_gettime(CLOCK_REALTIME, &start);
57535753
for (l = 0; l < loop; l++)
5754-
atan2256f_interleaved((complex32_t*)inout, inout2_ref, len);
5754+
atan2256f_interleaved((complex32_t *) inout, inout2_ref, len);
57555755
clock_gettime(CLOCK_REALTIME, &stop);
57565756
elapsed = ((stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3) / (double) loop;
57575757
printf("atan2256f_interleaved %d %lf\n", len, elapsed);
@@ -6277,6 +6277,22 @@ for (int i = 0; i < len; i++){
62776277
l2_errd(inoutd_ref, inoutd, len);
62786278
#endif
62796279

6280+
#ifdef AVX512
6281+
clock_gettime(CLOCK_REALTIME, &start);
6282+
vectorSlope512d(inoutd, len, 2.5, 3.0);
6283+
clock_gettime(CLOCK_REALTIME, &stop);
6284+
elapsed = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3;
6285+
printf("vectorSlope512d %d %lf\n", len, elapsed);
6286+
6287+
clock_gettime(CLOCK_REALTIME, &start);
6288+
for (l = 0; l < loop; l++)
6289+
vectorSlope512d(inoutd, len, 2.5, 3.0);
6290+
clock_gettime(CLOCK_REALTIME, &stop);
6291+
elapsed = ((stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3) / (double) loop;
6292+
printf("vectorSlope512d %d %lf\n", len, elapsed);
6293+
l2_errd(inoutd_ref, inoutd, len);
6294+
#endif
6295+
62806296
printf("\n");
62816297
/////////////////////////////////////////////////////////// SIGMOID //////////////////////////////////////////////////////////////////////////////
62826298
printf("SIGMOID\n");

simd_utils.h

+57-21
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ extern "C" {
2121

2222
#include <math.h>
2323
#include <stdint.h>
24+
#include <stdio.h>
2425

2526
static const float FOPI = 1.27323954473516f;
2627
static const float PIO4F = 0.7853981633974483096f;
@@ -60,25 +61,36 @@ static const int32_t inv_sign_mask = ~SIGN_MASK;
6061
#define IMM8_PERMUTE_128BITS_LANES 0x1 // reverse abcd efgh to efgh abcd
6162
#define M_PI 3.14159265358979323846
6263

63-
typedef struct {
64-
int16_t re;
65-
int16_t im;
64+
typedef union {
65+
struct {
66+
int16_t re;
67+
int16_t im;
68+
};
69+
int16_t c[2];
6670
} complex16s_t;
6771

68-
typedef struct {
69-
int32_t re;
70-
int32_t im;
72+
typedef union {
73+
struct {
74+
int32_t re;
75+
int32_t im;
76+
};
77+
int32_t c[2];
7178
} complex32s_t;
7279

73-
typedef struct {
74-
float re;
75-
float im;
80+
typedef union {
81+
struct {
82+
float re;
83+
float im;
84+
};
85+
float c[2];
7686
} complex32_t;
7787

78-
79-
typedef struct {
80-
double re;
81-
double im;
88+
typedef union {
89+
struct {
90+
double re;
91+
double im;
92+
};
93+
double c[2];
8294
} complex64_t;
8395

8496
typedef enum {
@@ -647,6 +659,30 @@ _PI256_64_CONST(2, 2);
647659
_PI256_64_CONST(4, 4);
648660
_PI256_64_CONST(0x7f, 0x7f);
649661

662+
typedef struct {
663+
v8sf val[2];
664+
} v8sfx2;
665+
666+
static inline v8sfx2 _mm256_load2_ps(float const *mem_addr)
667+
{
668+
v4sfx2 src_1 = _mm_load2_ps(mem_addr);
669+
v4sfx2 src_2 = _mm_load2_ps(mem_addr + 2 * SSE_LEN_FLOAT);
670+
v8sfx2 ret;
671+
ret.val[0] = _mm256_set_m128(src_2.val[0], src_1.val[0]);
672+
ret.val[1] = _mm256_set_m128(src_2.val[1], src_1.val[1]);
673+
return ret;
674+
}
675+
676+
static inline v8sfx2 _mm256_load2u_ps(float const *mem_addr)
677+
{
678+
v4sfx2 src_1 = _mm_load2u_ps(mem_addr);
679+
v4sfx2 src_2 = _mm_load2u_ps(mem_addr + 2 * SSE_LEN_FLOAT);
680+
v8sfx2 ret;
681+
ret.val[0] = _mm256_set_m128(src_2.val[0], src_1.val[0]);
682+
ret.val[1] = _mm256_set_m128(src_2.val[1], src_1.val[1]);
683+
return ret;
684+
}
685+
650686
#include "simd_utils_avx_double.h"
651687
#include "simd_utils_avx_float.h"
652688
#include "simd_utils_avx_int32.h"
@@ -907,23 +943,23 @@ static inline void fabsf_C(float *src, float *dst, int len)
907943
}
908944
}
909945

910-
static inline void setf_C(float *src, float value, int len)
946+
static inline void setf_C(float *dst, float value, int len)
911947
{
912948
#ifdef OMP
913949
#pragma omp simd
914950
#endif
915951
for (int i = 0; i < len; i++) {
916-
src[i] = value;
952+
dst[i] = value;
917953
}
918954
}
919955

920-
static inline void zerof_C(float *src, int len)
956+
static inline void zerof_C(float *dst, int len)
921957
{
922958
#ifdef OMP
923959
#pragma omp simd
924960
#endif
925961
for (int i = 0; i < len; i++) {
926-
src[i] = 0.0f;
962+
dst[i] = 0.0f;
927963
}
928964
}
929965

@@ -1777,23 +1813,23 @@ static inline void subs_c(int32_t *a, int32_t *b, int32_t *c, int len)
17771813
}*/
17781814

17791815

1780-
static inline void setd_C(double *src, double value, int len)
1816+
static inline void setd_C(double *dst, double value, int len)
17811817
{
17821818
#ifdef OMP
17831819
#pragma omp simd
17841820
#endif
17851821
for (int i = 0; i < len; i++) {
1786-
src[i] = value;
1822+
dst[i] = value;
17871823
}
17881824
}
17891825

1790-
static inline void zerod_C(double *src, int len)
1826+
static inline void zerod_C(double *dst, int len)
17911827
{
17921828
#ifdef OMP
17931829
#pragma omp simd
17941830
#endif
17951831
for (int i = 0; i < len; i++) {
1796-
src[i] = 0.0;
1832+
dst[i] = 0.0;
17971833
}
17981834
}
17991835

0 commit comments

Comments
 (0)