@@ -1681,8 +1681,6 @@ static inline void minmax128f(float *src, int len, float *min_value, float *max_
1681
1681
stop_len *= (2 * SSE_LEN_FLOAT );
1682
1682
stop_len = (stop_len < 0 ) ? 0 : stop_len ;
1683
1683
1684
- float min_f [SSE_LEN_FLOAT ] __attribute__((aligned (SSE_LEN_BYTES )));
1685
- float max_f [SSE_LEN_FLOAT ] __attribute__((aligned (SSE_LEN_BYTES )));
1686
1684
v4sf max_v , min_v , max_v2 , min_v2 ;
1687
1685
v4sf src_tmp , src_tmp2 ;
1688
1686
@@ -1725,6 +1723,20 @@ static inline void minmax128f(float *src, int len, float *min_value, float *max_
1725
1723
max_v = _mm_max_ps (max_v , max_v2 );
1726
1724
min_v = _mm_min_ps (min_v , min_v2 );
1727
1725
1726
+ #if 1
1727
+ v4sf max3 = _mm_shuffle_ps (max_v , max_v , _MM_SHUFFLE (0 ,1 ,2 ,3 )); // horizontal reduction, step 1: reverse lanes -> [a3, a2, a1, a0]
1728
+ v4sf min3 = _mm_shuffle_ps (min_v , min_v , _MM_SHUFFLE (0 ,1 ,2 ,3 )); // fix: reverse min_v (was max_v, which leaked max lanes into the min result)
1729
+ v4sf max4 = _mm_max_ps (max3 , max_v ); // lanes now hold [max(a0,a3), max(a1,a2), max(a1,a2), max(a0,a3)]
1730
+ v4sf min4 = _mm_min_ps (min3 , min_v ); // lanes now hold [min(b0,b3), min(b1,b2), min(b1,b2), min(b0,b3)]
1731
+ max3 = _mm_shuffle_ps (max4 , max4 , _MM_SHUFFLE (2 ,3 ,0 ,1 )); // step 2: swap adjacent lanes; fix: (1,0,1,0) left lane 0 in place so lanes 1 and 2 were never folded in
1732
+ min3 = _mm_shuffle_ps (min4 , min4 , _MM_SHUFFLE (2 ,3 ,0 ,1 )); // same pair swap for the minimum
1733
+ max4 = _mm_max_ps (max3 , max4 ); // every lane now holds the maximum of all four lanes of max_v
1734
+ min4 = _mm_min_ps (min3 , min4 ); // every lane now holds the minimum of all four lanes of min_v
1735
+ _mm_store_ss (& max_tmp , max4 ); // extract lane 0 as the scalar maximum
1736
+ _mm_store_ss (& min_tmp , min4 ); // extract lane 0 as the scalar minimum
1737
+ #else
1738
+ float min_f [SSE_LEN_FLOAT ] __attribute__((aligned (SSE_LEN_BYTES )));
1739
+ float max_f [SSE_LEN_FLOAT ] __attribute__((aligned (SSE_LEN_BYTES )));
1728
1740
_mm_store_ps (max_f , max_v );
1729
1741
_mm_store_ps (min_f , min_v );
1730
1742
@@ -1737,6 +1749,7 @@ static inline void minmax128f(float *src, int len, float *min_value, float *max_
1737
1749
min_tmp = min_tmp < min_f [1 ] ? min_tmp : min_f [1 ];
1738
1750
min_tmp = min_tmp < min_f [2 ] ? min_tmp : min_f [2 ];
1739
1751
min_tmp = min_tmp < min_f [3 ] ? min_tmp : min_f [3 ];
1752
+ #endif
1740
1753
}
1741
1754
1742
1755
for (int i = stop_len ; i < len ; i ++ ) {
@@ -3182,6 +3195,7 @@ static inline void sum128f(float *src, float *dst, int len)
3182
3195
vec_acc2 = _mm_add_ps (vec_acc2 , vec_tmp2 );
3183
3196
}
3184
3197
}
3198
+
3185
3199
vec_acc1 = _mm_add_ps (vec_acc1 , vec_acc2 );
3186
3200
_mm_store_ps (accumulate , vec_acc1 );
3187
3201
@@ -3190,7 +3204,6 @@ static inline void sum128f(float *src, float *dst, int len)
3190
3204
}
3191
3205
3192
3206
tmp_acc = tmp_acc + accumulate [0 ] + accumulate [1 ] + accumulate [2 ] + accumulate [3 ];
3193
-
3194
3207
* dst = tmp_acc ;
3195
3208
}
3196
3209
@@ -4410,8 +4423,8 @@ static inline void softmax128f(float *src, float *dst, int len)
4410
4423
int stop_len = len / (SSE_LEN_FLOAT );
4411
4424
stop_len *= (SSE_LEN_FLOAT );
4412
4425
4413
- __attribute__((aligned (SSE_LEN_BYTES ))) float accumulate [SSE_LEN_FLOAT ] = { 0.0f , 0.0f , 0.0f , 0.0f } ;
4414
- float acc = 0.0f ;
4426
+ __attribute__((aligned (SSE_LEN_BYTES ))) float accumulate [SSE_LEN_FLOAT ];
4427
+ float acc ;
4415
4428
4416
4429
v4sf vec_acc1 = _mm_setzero_ps (); // initialize the vector accumulator
4417
4430
@@ -4433,12 +4446,17 @@ static inline void softmax128f(float *src, float *dst, int len)
4433
4446
4434
4447
_mm_store_ps (accumulate , vec_acc1 );
4435
4448
4449
+ //From GCC _mm512_reduce_add_ps
4450
+ __m128 tmp1 = _mm_shuffle_ps (vec_acc1 , vec_acc1 , _MM_SHUFFLE ( 0 , 1 , 2 , 3 ));
4451
+ __m128 tmp2 = _mm_add_ps (tmp1 , vec_acc1 );
4452
+ _mm_store_ps (accumulate , tmp2 );
4453
+ acc = accumulate [0 ] + accumulate [1 ];
4454
+
4436
4455
for (int i = stop_len ; i < len ; i ++ ) {
4437
4456
dst [i ] = expf (src [i ]);
4438
4457
acc += dst [i ];
4439
4458
}
4440
4459
4441
- acc = acc + accumulate [0 ] + accumulate [1 ] + accumulate [2 ] + accumulate [3 ];
4442
4460
vec_acc1 = _mm_set1_ps (acc );
4443
4461
4444
4462
if (isAligned ((uintptr_t ) (dst ), SSE_LEN_BYTES )) {
@@ -4464,8 +4482,8 @@ static inline void softmax128f_dualacc(float *src, float *dst, int len)
4464
4482
int stop_len = len / (2 * SSE_LEN_FLOAT );
4465
4483
stop_len *= (2 * SSE_LEN_FLOAT );
4466
4484
4467
- __attribute__((aligned (SSE_LEN_BYTES ))) float accumulate [SSE_LEN_FLOAT ] = { 0.0f , 0.0f , 0.0f , 0.0f } ;
4468
- float acc = 0.0f ;
4485
+ __attribute__((aligned (SSE_LEN_BYTES ))) float accumulate [SSE_LEN_FLOAT ];
4486
+ float acc ;
4469
4487
4470
4488
v4sf vec_acc1 = _mm_setzero_ps (); // initialize the vector accumulator
4471
4489
v4sf vec_acc2 = _mm_setzero_ps (); // initialize the vector accumulator
@@ -4495,14 +4513,18 @@ static inline void softmax128f_dualacc(float *src, float *dst, int len)
4495
4513
}
4496
4514
4497
4515
vec_acc1 = _mm_add_ps (vec_acc1 , vec_acc2 );
4498
- _mm_store_ps (accumulate , vec_acc1 );
4516
+
4517
+ //From GCC _mm512_reduce_add_ps
4518
+ __m128 tmp1 = _mm_shuffle_ps (vec_acc1 , vec_acc1 , _MM_SHUFFLE ( 0 , 1 , 2 , 3 ));
4519
+ __m128 tmp2 = _mm_add_ps (tmp1 , vec_acc1 );
4520
+ _mm_store_ps (accumulate , tmp2 );
4521
+ acc = accumulate [0 ] + accumulate [1 ];
4499
4522
4500
4523
for (int i = stop_len ; i < len ; i ++ ) {
4501
4524
dst [i ] = expf (src [i ]);
4502
4525
acc += dst [i ];
4503
4526
}
4504
-
4505
- acc = acc + accumulate [0 ] + accumulate [1 ] + accumulate [2 ] + accumulate [3 ];
4527
+
4506
4528
vec_acc1 = _mm_set1_ps (acc );
4507
4529
4508
4530
if (areAligned2 ((uintptr_t ) (src ), (uintptr_t ) (dst ), SSE_LEN_BYTES )) {
0 commit comments