Skip to content

Commit 56ababe

Browse files
committed
removed some warnings, and added sum128s, and log2 functions
1 parent 08184b7 commit 56ababe

16 files changed

+273
-103
lines changed

avx512_mathfun.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ static inline v16sf exp512_ps(v16sf x)
208208
*/
209209
static inline v16sf sin512_ps(v16sf x)
210210
{ // any x
211-
v16sf xmm1, xmm2 = _mm512_setzero_ps(), xmm3, sign_bit, y;
211+
v16sf xmm3, sign_bit, y;
212212
v16si imm0, imm2;
213213

214214
#ifndef __AVX2__
@@ -322,7 +322,7 @@ static inline v16sf sin512_ps(v16sf x)
322322
/* almost the same as sin_ps */
323323
static inline v16sf cos512_ps(v16sf x)
324324
{ // any x
325-
v16sf xmm1, xmm2 = _mm512_setzero_ps(), xmm3, y;
325+
v16sf xmm3, y;
326326
v16si imm0, imm2;
327327

328328
#ifndef __AVX2__

avx_mathfun.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -834,7 +834,7 @@ static inline v8sf exp256_ps(v8sf x)
834834
*/
835835
static inline v8sf sin256_ps(v8sf x)
836836
{ // any x
837-
v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y;
837+
v8sf xmm3, sign_bit, y;
838838
v8si imm0, imm2;
839839

840840
#ifndef __AVX2__
@@ -948,7 +948,7 @@ static inline v8sf sin256_ps(v8sf x)
948948
/* almost the same as sin_ps */
949949
static inline v8sf cos256_ps(v8sf x)
950950
{ // any x
951-
v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, y;
951+
v8sf xmm3, y;
952952
v8si imm0, imm2;
953953

954954
#ifndef __AVX2__

mysincosf.h

-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ static float coscof[] = {2.443315711809948E-005, -1.388731625493765E-003,
3030

3131
static inline int mysincosf(float xx, float *s, float *c)
3232
{
33-
float *p;
3433
float x, y, y1, y2, z;
3534
int j, sign_sin, sign_cos;
3635

neon_mathfun.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -427,7 +427,7 @@ static inline v4sf exp_ps(v4sf x)
427427
//FMA version
428428
static inline void sincos_ps(v4sf x, v4sf *ysin, v4sf *ycos)
429429
{ // any x
430-
v4sf xmm1, xmm2, xmm3, y;
430+
v4sf y;
431431

432432
v4su emm2;
433433

simd_test.c

+76-5
Original file line numberDiff line numberDiff line change
@@ -256,9 +256,9 @@ int main(int argc, char **argv)
256256
int16_t *inout_s1 = NULL, *inout_s2 = NULL;
257257
int32_t *inout_i1 = NULL, *inout_i2 = NULL, *inout_iref = NULL;
258258
int len = atoi(argv[1]);
259-
int ret;
260259

261260
#ifndef USE_MALLOC
261+
int ret;
262262
ret = posix_memalign((void **) &inout, atoi(argv[2]), 2 * len * sizeof(float));
263263
if (inout == NULL) {
264264
printf("ret = posix_memalign inout failed\n");
@@ -1583,7 +1583,7 @@ printf("\n");
15831583
printf("mean128f %d %lf\n", len, elapsed);
15841584

15851585
printf("mean %f ref %f\n", mean, mean_ref);
1586-
1586+
15871587
clock_gettime(CLOCK_REALTIME, &start);
15881588
meankahan128f(inout, &mean, len);
15891589
clock_gettime(CLOCK_REALTIME, &stop);
@@ -2042,7 +2042,7 @@ printf("\n");
20422042
elapsed = ((stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3) / (double) loop;
20432043
printf("cplxconjvecmul128f %d %lf %0.3lf GFlops/s\n", len, elapsed, flops / (elapsed * 1e3));
20442044

2045-
l2_err(inout_ref, inout2_ref, len);
2045+
l2_err(inout_ref, inout2_ref, 2*len);
20462046
#endif
20472047

20482048
#ifdef AVX
@@ -2059,7 +2059,7 @@ printf("\n");
20592059
elapsed = ((stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3) / (double) loop;
20602060
printf("cplxconjvecmul256f %d %lf %0.3lf GFlops/s\n", len, elapsed, flops / (elapsed * 1e3));
20612061

2062-
l2_err(inout_ref, inout2_ref, len);
2062+
l2_err(inout_ref, inout2_ref, 2*len);
20632063
#endif
20642064

20652065
#ifdef AVX512
@@ -2076,7 +2076,7 @@ printf("\n");
20762076
elapsed = ((stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3) / (double) loop;
20772077
printf("cplxconjvecmul512f %d %lf %0.3lf GFlops/s\n", len, elapsed, flops / (elapsed * 1e3));
20782078

2079-
l2_err(inout_ref, inout2_ref, len);
2079+
l2_err(inout_ref, inout2_ref, 2*len);
20802080
#endif
20812081
printf("\n");
20822082

@@ -3150,6 +3150,77 @@ printf("\n");
31503150
#endif
31513151

31523152

3153+
printf("\n");
3154+
/////////////////////////////////////////////////////////// LOG2 //////////////////////////////////////////////////////////////////////////////
3155+
printf("LOG2\n");
3156+
3157+
for (int i = 0; i < len; i++) {
3158+
inout[i] = (float) (1.0f * i + 0.000001f) / 1.82f;
3159+
inout_ref[i] = inout[i];
3160+
}
3161+
3162+
clock_gettime(CLOCK_REALTIME, &start);
3163+
log2f_C(inout, inout2_ref, len);
3164+
clock_gettime(CLOCK_REALTIME, &stop);
3165+
elapsed = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3;
3166+
printf("log2f_C %d %lf\n", len, elapsed);
3167+
3168+
clock_gettime(CLOCK_REALTIME, &start);
3169+
for (l = 0; l < loop; l++)
3170+
log2f_C(inout, inout2_ref, len);
3171+
clock_gettime(CLOCK_REALTIME, &stop);
3172+
elapsed = ((stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3) / (double) loop;
3173+
printf("log2f_C %d %lf\n", len, elapsed);
3174+
3175+
#ifdef SSE
3176+
clock_gettime(CLOCK_REALTIME, &start);
3177+
log2_128f(inout, inout2, len);
3178+
clock_gettime(CLOCK_REALTIME, &stop);
3179+
elapsed = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3;
3180+
printf("log2_128f %d %lf\n", len, elapsed);
3181+
3182+
clock_gettime(CLOCK_REALTIME, &start);
3183+
for (l = 0; l < loop; l++)
3184+
log2_128f(inout, inout2, len);
3185+
clock_gettime(CLOCK_REALTIME, &stop);
3186+
elapsed = ((stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3) / (double) loop;
3187+
printf("log2_128f %d %lf\n", len, elapsed);
3188+
l2_err(inout2_ref, inout2, len);
3189+
#endif
3190+
3191+
#ifdef AVX
3192+
clock_gettime(CLOCK_REALTIME, &start);
3193+
log2_256f(inout, inout2, len);
3194+
clock_gettime(CLOCK_REALTIME, &stop);
3195+
elapsed = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3;
3196+
printf("log2_256f %d %lf\n", len, elapsed);
3197+
3198+
clock_gettime(CLOCK_REALTIME, &start);
3199+
for (l = 0; l < loop; l++)
3200+
log2_256f(inout, inout2, len);
3201+
clock_gettime(CLOCK_REALTIME, &stop);
3202+
elapsed = ((stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3) / (double) loop;
3203+
printf("log2_256f %d %lf\n", len, elapsed);
3204+
l2_err(inout2_ref, inout2, len);
3205+
#endif
3206+
3207+
#ifdef AVX512
3208+
clock_gettime(CLOCK_REALTIME, &start);
3209+
log2_512f(inout, inout2, len);
3210+
clock_gettime(CLOCK_REALTIME, &stop);
3211+
elapsed = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3;
3212+
printf("log2_512f %d %lf\n", len, elapsed);
3213+
3214+
clock_gettime(CLOCK_REALTIME, &start);
3215+
for (l = 0; l < loop; l++)
3216+
log2_512f(inout, inout2, len);
3217+
clock_gettime(CLOCK_REALTIME, &stop);
3218+
elapsed = ((stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3) / (double) loop;
3219+
printf("log2_512f %d %lf\n", len, elapsed);
3220+
l2_err(inout2_ref, inout2, len);
3221+
#endif
3222+
3223+
31533224
printf("\n");
31543225
/////////////////////////////////////////////////////////// EXP //////////////////////////////////////////////////////////////////////////////
31553226
printf("EXP\n");

simd_utils.h

+11-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@ extern "C" {
2424

2525
#include "mysincosf.h"
2626

27-
#define INVLN10 0.4342944819
27+
#define INVLN10 0.4342944819032518f //0.4342944819f
28+
#define INVLN2 1.4426950408889634f //1.44269504089f
2829
#define IMM8_FLIP_VEC 0x1B // change m128 from abcd to dcba
2930
#define IMM8_LO_HI_VEC 0x1E // change m128 from abcd to cdab
3031
#define IMM8_PERMUTE_128BITS_LANES 0x1 // reverse abcd efgh to efgh abcd
@@ -719,6 +720,15 @@ static inline void log10f_C(float *src, float *dst, int len)
719720
dst[i] = log10f(src[i]);
720721
}
721722

723+
static inline void log2f_C(float *src, float *dst, int len)
724+
{
725+
#ifdef OMP
726+
#pragma omp simd
727+
#endif
728+
for (int i = 0; i < len; i++)
729+
dst[i] = log2f(src[i]);
730+
}
731+
722732
static inline void lnf_C(float *src, float *dst, int len)
723733
{
724734
#ifdef OMP

simd_utils_avx512_double.h

+1-9
Original file line numberDiff line numberDiff line change
@@ -229,15 +229,13 @@ static inline void muladd512d(double *_a, double *_b, double *_c, double *dst, i
229229

230230
if (areAligned2((uintptr_t)(_a), (uintptr_t)(_b), AVX512_LEN_BYTES) &&
231231
areAligned2((uintptr_t)(_c), (uintptr_t)(dst), AVX512_LEN_BYTES)) {
232-
#pragma unroll(2)
233232
for (int i = 0; i < stop_len; i += AVX512_LEN_DOUBLE) {
234233
v8sd a = _mm512_load_pd(_a + i);
235234
v8sd b = _mm512_load_pd(_b + i);
236235
v8sd c = _mm512_load_pd(_c + i);
237236
_mm512_store_pd(dst + i, _mm512_fmadd_pd_custom(a, b, c));
238237
}
239238
} else {
240-
#pragma unroll(2)
241239
for (int i = 0; i < stop_len; i += AVX512_LEN_DOUBLE) {
242240
v8sd a = _mm512_loadu_pd(_a + i);
243241
v8sd b = _mm512_loadu_pd(_b + i);
@@ -259,14 +257,12 @@ static inline void mulcadd512d(double *_a, double _b, double *_c, double *dst, i
259257
stop_len *= AVX512_LEN_DOUBLE;
260258

261259
if (areAligned3((uintptr_t)(_a), (uintptr_t)(_c), (uintptr_t)(dst), AVX512_LEN_BYTES)) {
262-
#pragma unroll(2)
263260
for (int i = 0; i < stop_len; i += AVX512_LEN_DOUBLE) {
264261
v8sd a = _mm512_load_pd(_a + i);
265262
v8sd c = _mm512_load_pd(_c + i);
266263
_mm512_store_pd(dst + i, _mm512_fmadd_pd_custom(a, b, c));
267264
}
268265
} else {
269-
#pragma unroll(2)
270266
for (int i = 0; i < stop_len; i += AVX512_LEN_DOUBLE) {
271267
v8sd a = _mm512_loadu_pd(_a + i);
272268
v8sd c = _mm512_loadu_pd(_c + i);
@@ -288,13 +284,11 @@ static inline void mulcaddc512d(double *_a, double _b, double _c, double *dst, i
288284
stop_len *= AVX512_LEN_DOUBLE;
289285

290286
if (areAligned2((uintptr_t)(_a), (uintptr_t)(dst), AVX512_LEN_BYTES)) {
291-
#pragma unroll(2)
292287
for (int i = 0; i < stop_len; i += AVX512_LEN_DOUBLE) {
293288
v8sd a = _mm512_loadu_pd(_a + i);
294289
_mm512_store_pd(dst + i, _mm512_fmadd_pd_custom(a, b, c));
295290
}
296291
} else {
297-
#pragma unroll(2)
298292
for (int i = 0; i < stop_len; i += AVX512_LEN_DOUBLE) {
299293
v8sd a = _mm512_loadu_pd(_a + i);
300294
_mm512_storeu_pd(dst + i, _mm512_fmadd_pd_custom(a, b, c));
@@ -314,14 +308,12 @@ static inline void muladdc512d(double *_a, double *_b, double _c, double *dst, i
314308
stop_len *= AVX512_LEN_DOUBLE;
315309

316310
if (areAligned3((uintptr_t)(_a), (uintptr_t)(_b), (uintptr_t)(dst), AVX512_LEN_BYTES)) {
317-
#pragma unroll(2)
318311
for (int i = 0; i < stop_len; i += AVX512_LEN_DOUBLE) {
319312
v8sd a = _mm512_load_pd(_a + i);
320313
v8sd b = _mm512_load_pd(_b + i);
321314
_mm512_store_pd(dst + i, _mm512_fmadd_pd_custom(a, b, c));
322315
}
323316
} else {
324-
#pragma unroll(2)
325317
for (int i = 0; i < stop_len; i += AVX512_LEN_DOUBLE) {
326318
v8sd a = _mm512_loadu_pd(_a + i);
327319
v8sd b = _mm512_loadu_pd(_b + i);
@@ -464,7 +456,7 @@ static inline void vectorSlope512d(double *dst, int len, double offset, double s
464456

465457
static inline v8sd asin512_pd(v8sd x)
466458
{
467-
v8sd a, z, z_tmp;
459+
v8sd a, z;
468460
__mmask8 sign;
469461
__mmask8 ainfem8, asup0p625;
470462

simd_utils_avx512_float.h

+25-1
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,30 @@ static inline void log10_512f(float *src, float *dst, int len)
112112
}
113113
}
114114

115+
static inline void log2_512f(float *src, float *dst, int len)
116+
{
117+
const v16sf invln2f = _mm512_set1_ps((float) INVLN2); //_mm512_broadcast_ss(&invln10f_mask);
118+
119+
int stop_len = len / AVX512_LEN_FLOAT;
120+
stop_len *= AVX512_LEN_FLOAT;
121+
122+
if (((uintptr_t)(const void *) (src) % AVX512_LEN_BYTES) == 0) {
123+
for (int i = 0; i < stop_len; i += AVX512_LEN_FLOAT) {
124+
v16sf src_tmp = log512_ps(_mm512_load_ps(src + i));
125+
_mm512_store_ps(dst + i, _mm512_mul_ps(src_tmp, invln2f));
126+
}
127+
} else {
128+
for (int i = 0; i < stop_len; i += AVX512_LEN_FLOAT) {
129+
v16sf src_tmp = log512_ps(_mm512_loadu_ps(src + i));
130+
_mm512_storeu_ps(dst + i, _mm512_mul_ps(src_tmp, invln2f));
131+
}
132+
}
133+
134+
for (int i = stop_len; i < len; i++) {
135+
dst[i] = log10f(src[i]);
136+
}
137+
}
138+
115139
static inline void ln_512f(float *src, float *dst, int len)
116140
{
117141
int stop_len = len / AVX512_LEN_FLOAT;
@@ -1320,7 +1344,7 @@ static inline void asin512f(float *src, float *dst, int len)
13201344
static inline v16sf tanh512f_ps(v16sf xx)
13211345
{
13221346
v16sf x, z, z_first_branch, z_second_branch;
1323-
__mmask16 xxsup0, xxinf0, xsupmaxlogfdiv2, xsup0p625;
1347+
__mmask16 xxsup0, xsupmaxlogfdiv2, xsup0p625;
13241348

13251349
xxsup0 = _mm512_cmp_ps_mask(xx, _mm512_setzero_ps(), _CMP_GT_OS);
13261350
xsupmaxlogfdiv2 = _mm512_cmp_ps_mask(xx, *(v16sf *) _ps512_MAXLOGFDIV2, _CMP_GT_OS);

simd_utils_avx512_int32.h

+15-5
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,9 @@ static inline void copy512s(int32_t *src, int32_t *dst, int len)
9898
int stop_len = len / AVX512_LEN_INT32;
9999
stop_len *= AVX512_LEN_INT32;
100100

101-
#pragma omp parallel for schedule(auto) num_threads(NBTHREADS)
101+
#ifdef OMP
102+
#pragma omp parallel for schedule(auto)
103+
#endif
102104
for (int i = 0; i < stop_len; i += AVX512_LEN_INT32) {
103105
_mm512_store_si512((__m512i *) (dst + i), _mm512_load_si512((__m512i *) (src + i)));
104106
}
@@ -113,7 +115,9 @@ static inline void copy512s_2(int32_t *src, int32_t *dst, int len)
113115
int stop_len = len / (2 * AVX512_LEN_INT32);
114116
stop_len *= (2 * AVX512_LEN_INT32);
115117

116-
#pragma omp parallel for schedule(auto) num_threads(NBTHREADS)
118+
#ifdef OMP
119+
#pragma omp parallel for schedule(auto)
120+
#endif
117121
for (int i = 0; i < stop_len; i += 2 * AVX512_LEN_INT32) {
118122
__m512i tmp1 = _mm512_load_si512((__m512i *) (src + i));
119123
__m512i tmp2 = _mm512_load_si512((__m512i *) (src + i + AVX512_LEN_INT32));
@@ -131,7 +135,9 @@ static inline void fast_copy512s(int32_t *src, int32_t *dst, int len)
131135
int stop_len = len / AVX512_LEN_INT32;
132136
stop_len *= AVX512_LEN_INT32;
133137

134-
#pragma omp parallel for schedule(auto) num_threads(NBTHREADS)
138+
#ifdef OMP
139+
#pragma omp parallel for schedule(auto)
140+
#endif
135141
for (int i = 0; i < stop_len; i += AVX512_LEN_INT32) {
136142
_mm512_stream_si512((__m512i *) (dst + i), _mm512_stream_load_si512((__m512i *) (src + i)));
137143
}
@@ -148,7 +154,9 @@ static inline void fast_copy512s_2(int32_t *src, int32_t *dst, int len)
148154
int stop_len = len / (2 * AVX512_LEN_INT32);
149155
stop_len *= (2 * AVX512_LEN_INT32);
150156

151-
#pragma omp parallel for schedule(auto) num_threads(NBTHREADS)
157+
#ifdef OMP
158+
#pragma omp parallel for schedule(auto)
159+
#endif
152160
for (int i = 0; i < stop_len; i += 2 * AVX512_LEN_INT32) {
153161
__m512i tmp1 = _mm512_stream_load_si512((__m512i *) (src + i));
154162
__m512i tmp2 = _mm512_stream_load_si512((__m512i *) (src + i + AVX512_LEN_INT32));
@@ -167,7 +175,9 @@ static inline void fast_copy512s_4(int32_t *src, int32_t *dst, int len)
167175
int stop_len = len / (4 * AVX512_LEN_INT32);
168176
stop_len *= (4 * AVX512_LEN_INT32);
169177

170-
#pragma omp parallel for schedule(auto) num_threads(NBTHREADS)
178+
#ifdef OMP
179+
#pragma omp parallel for schedule(auto)
180+
#endif
171181
for (int i = 0; i < stop_len; i += 4 * AVX512_LEN_INT32) {
172182
__m512i tmp1 = _mm512_stream_load_si512((__m512i *) (src + i));
173183
__m512i tmp2 = _mm512_stream_load_si512((__m512i *) (src + i + AVX512_LEN_INT32));

0 commit comments

Comments
 (0)