Skip to content

Commit 160c50f

Browse files
committed
minor SSE/AVX improvements
1 parent 6c630ac commit 160c50f

File tree

2 files changed

+59
-25
lines changed

2 files changed

+59
-25
lines changed

simd_utils_avx_float.h

+26-14
Original file line numberDiff line numberDiff line change
@@ -1635,12 +1635,12 @@ static inline void minmax256f(float *src, int len, float *min_value, float *max_
16351635
v4sf min4 = _mm256_extractf128_ps(min_v, 1);
16361636
max4 = _mm_max_ps(max3, max4);
16371637
min4 = _mm_min_ps(min3, min4);
1638-
max3 = _mm_permute_ps(max4, 0x0E);
1639-
min3 = _mm_permute_ps(min4, 0x0E);
1638+
max3 = _mm_shuffle_ps(max4, max4, _MM_SHUFFLE(0,1,2,3)); //max3 = _mm_permute_ps(max4, 0x0E);
1639+
min3 = _mm_shuffle_ps(min4, min4, _MM_SHUFFLE(0,1,2,3)); //min3 = _mm_permute_ps(min4, 0x0E);
16401640
max4 = _mm_max_ps(max3, max4);
16411641
min4 = _mm_min_ps(min3, min4);
1642-
max3 = _mm_permute_ps(max4, 0x01);
1643-
min3 = _mm_permute_ps(min4, 0x01);
1642+
max3 = _mm_shuffle_ps(max4, max4, _MM_SHUFFLE(1,0,1,0)); //max3 = _mm_permute_ps(max4, 0x01);
1643+
min3 = _mm_shuffle_ps(min4, min4, _MM_SHUFFLE(1,0,1,0)); //min3 = _mm_permute_ps(min4, 0x01);
16441644
max4 = _mm_max_ps(max3, max4);
16451645
min4 = _mm_min_ps(min3, min4);
16461646
_mm_store_ss(&max_tmp, max4);
@@ -3039,8 +3039,8 @@ static inline void sum256f(float *src, float *dst, int len)
30393039
int stop_len = len / (2 * AVX_LEN_FLOAT);
30403040
stop_len *= (2 * AVX_LEN_FLOAT);
30413041

3042-
__attribute__((aligned(AVX_LEN_BYTES))) float accumulate[AVX_LEN_FLOAT];
3043-
float tmp_acc = 0.0f;
3042+
__attribute__((aligned(SSE_LEN_BYTES))) float accumulate[SSE_LEN_FLOAT];
3043+
float tmp_acc;
30443044
v8sf vec_acc1 = _mm256_setzero_ps(); // initialize the vector accumulator
30453045
v8sf vec_acc2 = _mm256_setzero_ps(); // initialize the vector accumulator
30463046

@@ -3061,14 +3061,20 @@ static inline void sum256f(float *src, float *dst, int len)
30613061
}
30623062

30633063
vec_acc1 = _mm256_add_ps(vec_acc1, vec_acc2);
3064-
_mm256_store_ps(accumulate, vec_acc1);
3064+
3065+
//From GCC _mm512_reduce_add_ps
3066+
__m128 tmp1 = _mm256_extractf128_ps (vec_acc1, 1);
3067+
__m128 tmp2 = _mm256_extractf128_ps (vec_acc1, 0);
3068+
__m128 tmp3 = _mm_add_ps(tmp1, tmp2);
3069+
__m128 tmp4 = _mm_shuffle_ps(tmp3, tmp3, _MM_SHUFFLE( 0, 1, 2, 3));
3070+
__m128 tmp5 = _mm_add_ps(tmp3, tmp4);
3071+
_mm_store_ps(accumulate, tmp5);
3072+
tmp_acc = accumulate[0] + accumulate[1];
30653073

30663074
for (int i = stop_len; i < len; i++) {
30673075
tmp_acc += src[i];
30683076
}
30693077

3070-
tmp_acc = tmp_acc + accumulate[0] + accumulate[1] + accumulate[2] + accumulate[3] + accumulate[4] + accumulate[5] + accumulate[6] + accumulate[7];
3071-
30723078
*dst = tmp_acc;
30733079
}
30743080

@@ -3753,9 +3759,8 @@ static inline void softmax256f(float *src, float *dst, int len)
37533759
int stop_len = len / (AVX_LEN_FLOAT);
37543760
stop_len *= (AVX_LEN_FLOAT);
37553761

3756-
__attribute__((aligned(AVX_LEN_BYTES))) float accumulate[AVX_LEN_FLOAT] = {0.0f, 0.0f, 0.0f, 0.0f,
3757-
0.0f, 0.0f, 0.0f, 0.0f};
3758-
float acc = 0.0f;
3762+
__attribute__((aligned(SSE_LEN_BYTES))) float accumulate[SSE_LEN_FLOAT];
3763+
float acc;
37593764

37603765
v8sf vec_acc1 = _mm256_setzero_ps(); // initialize the vector accumulator
37613766

@@ -3776,14 +3781,21 @@ static inline void softmax256f(float *src, float *dst, int len)
37763781
}
37773782

37783783
_mm256_store_ps(accumulate, vec_acc1);
3784+
3785+
//From GCC _mm512_reduce_add_ps
3786+
__m128 tmp1 = _mm256_extractf128_ps (vec_acc1, 1);
3787+
__m128 tmp2 = _mm256_extractf128_ps (vec_acc1, 0);
3788+
__m128 tmp3 = _mm_add_ps(tmp1, tmp2);
3789+
__m128 tmp4 = _mm_shuffle_ps(tmp3, tmp3, _MM_SHUFFLE( 0, 1, 2, 3));
3790+
__m128 tmp5 = _mm_add_ps(tmp3, tmp4);
3791+
_mm_store_ps(accumulate, tmp5);
3792+
acc = accumulate[0] + accumulate[1];
37793793

37803794
for (int i = stop_len; i < len; i++) {
37813795
dst[i] = expf(src[i]);
37823796
acc += dst[i];
37833797
}
37843798

3785-
acc = acc + accumulate[0] + accumulate[1] + accumulate[2] + accumulate[3] +
3786-
accumulate[4] + accumulate[5] + accumulate[6] + accumulate[7];
37873799
vec_acc1 = _mm256_set1_ps(acc);
37883800

37893801
if (areAligned2((uintptr_t) (src), (uintptr_t) (dst), AVX_LEN_BYTES)) {

simd_utils_sse_float.h

+33-11
Original file line numberDiff line numberDiff line change
@@ -1681,8 +1681,6 @@ static inline void minmax128f(float *src, int len, float *min_value, float *max_
16811681
stop_len *= (2 * SSE_LEN_FLOAT);
16821682
stop_len = (stop_len < 0) ? 0 : stop_len;
16831683

1684-
float min_f[SSE_LEN_FLOAT] __attribute__((aligned(SSE_LEN_BYTES)));
1685-
float max_f[SSE_LEN_FLOAT] __attribute__((aligned(SSE_LEN_BYTES)));
16861684
v4sf max_v, min_v, max_v2, min_v2;
16871685
v4sf src_tmp, src_tmp2;
16881686

@@ -1725,6 +1723,20 @@ static inline void minmax128f(float *src, int len, float *min_value, float *max_
17251723
max_v = _mm_max_ps(max_v, max_v2);
17261724
min_v = _mm_min_ps(min_v, min_v2);
17271725

1726+
#if 1
1727+
v4sf max3 = _mm_shuffle_ps(max_v, max_v, _MM_SHUFFLE(0,1,2,3));
1728+
v4sf min3 = _mm_shuffle_ps(max_v, max_v, _MM_SHUFFLE(0,1,2,3));
1729+
v4sf max4 = _mm_max_ps(max3, max_v);
1730+
v4sf min4 = _mm_min_ps(min3, min_v);
1731+
max3 = _mm_shuffle_ps(max4, max4, _MM_SHUFFLE(1,0,1,0));
1732+
min3 = _mm_shuffle_ps(min4, min4, _MM_SHUFFLE(1,0,1,0));
1733+
max4 = _mm_max_ps(max3, max4);
1734+
min4 = _mm_min_ps(min3, min4);
1735+
_mm_store_ss(&max_tmp, max4);
1736+
_mm_store_ss(&min_tmp, min4);
1737+
#else
1738+
float min_f[SSE_LEN_FLOAT] __attribute__((aligned(SSE_LEN_BYTES)));
1739+
float max_f[SSE_LEN_FLOAT] __attribute__((aligned(SSE_LEN_BYTES)));
17281740
_mm_store_ps(max_f, max_v);
17291741
_mm_store_ps(min_f, min_v);
17301742

@@ -1737,6 +1749,7 @@ static inline void minmax128f(float *src, int len, float *min_value, float *max_
17371749
min_tmp = min_tmp < min_f[1] ? min_tmp : min_f[1];
17381750
min_tmp = min_tmp < min_f[2] ? min_tmp : min_f[2];
17391751
min_tmp = min_tmp < min_f[3] ? min_tmp : min_f[3];
1752+
#endif
17401753
}
17411754

17421755
for (int i = stop_len; i < len; i++) {
@@ -3182,6 +3195,7 @@ static inline void sum128f(float *src, float *dst, int len)
31823195
vec_acc2 = _mm_add_ps(vec_acc2, vec_tmp2);
31833196
}
31843197
}
3198+
31853199
vec_acc1 = _mm_add_ps(vec_acc1, vec_acc2);
31863200
_mm_store_ps(accumulate, vec_acc1);
31873201

@@ -3190,7 +3204,6 @@ static inline void sum128f(float *src, float *dst, int len)
31903204
}
31913205

31923206
tmp_acc = tmp_acc + accumulate[0] + accumulate[1] + accumulate[2] + accumulate[3];
3193-
31943207
*dst = tmp_acc;
31953208
}
31963209

@@ -4410,8 +4423,8 @@ static inline void softmax128f(float *src, float *dst, int len)
44104423
int stop_len = len / (SSE_LEN_FLOAT);
44114424
stop_len *= (SSE_LEN_FLOAT);
44124425

4413-
__attribute__((aligned(SSE_LEN_BYTES))) float accumulate[SSE_LEN_FLOAT] = {0.0f, 0.0f, 0.0f, 0.0f};
4414-
float acc = 0.0f;
4426+
__attribute__((aligned(SSE_LEN_BYTES))) float accumulate[SSE_LEN_FLOAT];
4427+
float acc;
44154428

44164429
v4sf vec_acc1 = _mm_setzero_ps(); // initialize the vector accumulator
44174430

@@ -4433,12 +4446,17 @@ static inline void softmax128f(float *src, float *dst, int len)
44334446

44344447
_mm_store_ps(accumulate, vec_acc1);
44354448

4449+
//From GCC _mm512_reduce_add_ps
4450+
__m128 tmp1 = _mm_shuffle_ps(vec_acc1, vec_acc1, _MM_SHUFFLE( 0, 1, 2, 3));
4451+
__m128 tmp2 = _mm_add_ps(tmp1, vec_acc1);
4452+
_mm_store_ps(accumulate, tmp2);
4453+
acc = accumulate[0] + accumulate[1];
4454+
44364455
for (int i = stop_len; i < len; i++) {
44374456
dst[i] = expf(src[i]);
44384457
acc += dst[i];
44394458
}
44404459

4441-
acc = acc + accumulate[0] + accumulate[1] + accumulate[2] + accumulate[3];
44424460
vec_acc1 = _mm_set1_ps(acc);
44434461

44444462
if (isAligned((uintptr_t) (dst), SSE_LEN_BYTES)) {
@@ -4464,8 +4482,8 @@ static inline void softmax128f_dualacc(float *src, float *dst, int len)
44644482
int stop_len = len / (2 * SSE_LEN_FLOAT);
44654483
stop_len *= (2 * SSE_LEN_FLOAT);
44664484

4467-
__attribute__((aligned(SSE_LEN_BYTES))) float accumulate[SSE_LEN_FLOAT] = {0.0f, 0.0f, 0.0f, 0.0f};
4468-
float acc = 0.0f;
4485+
__attribute__((aligned(SSE_LEN_BYTES))) float accumulate[SSE_LEN_FLOAT];
4486+
float acc;
44694487

44704488
v4sf vec_acc1 = _mm_setzero_ps(); // initialize the vector accumulator
44714489
v4sf vec_acc2 = _mm_setzero_ps(); // initialize the vector accumulator
@@ -4495,14 +4513,18 @@ static inline void softmax128f_dualacc(float *src, float *dst, int len)
44954513
}
44964514

44974515
vec_acc1 = _mm_add_ps(vec_acc1, vec_acc2);
4498-
_mm_store_ps(accumulate, vec_acc1);
4516+
4517+
//From GCC _mm512_reduce_add_ps
4518+
__m128 tmp1 = _mm_shuffle_ps(vec_acc1, vec_acc1, _MM_SHUFFLE( 0, 1, 2, 3));
4519+
__m128 tmp2 = _mm_add_ps(tmp1, vec_acc1);
4520+
_mm_store_ps(accumulate, tmp2);
4521+
acc = accumulate[0] + accumulate[1];
44994522

45004523
for (int i = stop_len; i < len; i++) {
45014524
dst[i] = expf(src[i]);
45024525
acc += dst[i];
45034526
}
4504-
4505-
acc = acc + accumulate[0] + accumulate[1] + accumulate[2] + accumulate[3];
4527+
45064528
vec_acc1 = _mm_set1_ps(acc);
45074529

45084530
if (areAligned2((uintptr_t) (src), (uintptr_t) (dst), SSE_LEN_BYTES)) {

0 commit comments

Comments
 (0)