Skip to content

Commit 5180d47

Browse files
committed
more unrolling, fixed unaligned flip functions, and upgrade to version 0.2.1
1 parent 279617e commit 5180d47

20 files changed

+545
-193
lines changed

avx512_mathfun.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Project : SIMD_Utils
3-
* Version : 0.2.0
3+
* Version : 0.2.1
44
* Author : JishinMaster
55
* Licence : BSD-2
66
*/

latencies.txt

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
SSE
2+
_mm_store_ps lat 1, cpi 1 (ivy ) 0.5 (broadwell), NEON vst1q_f32 lat 1 cpi 1 (a72)
3+
_mm_storeu_ps lat 1, cpi 1 (ivy ) 0.5 (broadwell)
4+
_mm_load_ps lat 1, cpi 1 (ivy ) 0.5 (broadwell) NEON vld1q_f32 lat 5 cpi 1 (a72)
5+
_mm_loadu_ps lat 1, cpi 1 (ivy ) 0.5 (broadwell)
6+
_mm_min_ps lat 3, cpi 1 (ivy ) 1 (broadwell) NEON vminq_f32 lat 3 cpi 0.5 (a72)
7+
_mm_max_ps lat 3, cpi 1 (ivy ) 1 (broadwell) NEON vmaxq_f32 lat 3 cpi 0.5 (a72)
8+
_mm_cvtpd_ps lat 4, cpi 1 (ivy ) 1 (broadwell) NEON vcvt_f32_f64 lat 3 cpi 1 (a72)
9+
_mm_mul_ps lat 5 (ivy) 3 (broadwell), cpi 1 (ivy) 0.5 (broadwell) NEON vmulq_f32 lat 4 cpi 1 (a72)
10+
_mm_div_ps lat 11-14 (ivy) <11 (broadwell), cpi 6 (ivy) 4 (broadwell) NEON vdivq_f32 lat 12-22 cpi 10-18 (a72)
11+
_mm_movelh_ps lat 1, cpi 1
12+
_mm_hadd_ps lat 5, cpi 2 => useful for reduction! NEON vpaddq_f32 lat 7 cpi 3/2 (a72)
13+
_mm_shuffle_ps lat 1, cpi 1 NEON uses gcc __builtin_shufflevector which might be VREV64 lat 3 cpi 0.5, VSWP lat 3 cpi 1, VEXT lat 3 cpi 0.5
14+
_mm_cvtps_epi32 lat 3, cpi 1
15+
_mm_round_ps
16+
_mm_castsi128_ps
17+
18+
neon blendv => vshrq_n_s32 (lat 3, cpi 1 on mask) +
19+
20+
AVX/AVX2
21+
_mm256_store_ps lat 1, cpi 1 (ivy ) 0.5 (broadwell)
22+
_mm256_storeu_ps lat 1, cpi 1 (ivy ) 0.5 (broadwell)
23+
_mm256_load_ps lat 1, cpi 1 (ivy ) 0.5 (broadwell)
24+
_mm256_loadu_ps lat 1, cpi 1 (ivy ) 0.5 (broadwell)
25+
_mm256_min_ps lat 3, cpi 1 (ivy ) 1 (broadwell)
26+
_mm256_max_ps lat 3, cpi 1 (ivy ) 1 (broadwell)
27+
_mm256_cvtpd_ps lat 4 (ivy) 6 (broadwell), cpi 1 (ivy ) 1 (broadwell)
28+
_mm256_mul_ps lat 5 (ivy) 3 (broadwell), cpi 1 (ivy) 0.5 (broadwell)
29+
_mm256_div_ps lat 18-21 (ivy) 13-17 (broadwell), cpi 14 (ivy) 10 (broadwell)
30+
_mm256_set_m128 lat 3, cpi 1
31+
_mm256_hadd_ps
32+
_mm256_permute_ps lat 1, cpi 1
33+
_mm256_permute2f128_ps lat 2 (ivy) 3 (broadwell), cpi 1
34+
35+
//AVX512/VNNI
36+
_mm512_dpbusd_epi32 lat 5?, cpi 1 (icelake)
37+
_mm512_madd_epi16 lat 5?, cpi 1 (icelake)
38+
_mm512_add_epi32 lat 1, cpi 0.5 (icelake)
39+
_mm512_maddubs_epi16 lat 5?, cpi 1 (icelake)

mysincosf.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Project : SIMD_Utils
3-
* Version : 0.2.0
3+
* Version : 0.2.1
44
* Author : JishinMaster
55
* Licence : BSD-2
66
*/

simd_test.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Project : SIMD_Utils
3-
* Version : 0.2.0
3+
* Version : 0.2.1
44
* Author : JishinMaster
55
* Licence : BSD-2
66
*/

simd_test_opencl.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Project : SIMD_Utils
3-
* Version : 0.2.0
3+
* Version : 0.2.1
44
* Author : JishinMaster
55
* Licence : BSD-2
66
*/

simd_utils.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Project : SIMD_Utils
3-
* Version : 0.2.0
3+
* Version : 0.2.1
44
* Author : JishinMaster
55
* Licence : BSD-2
66
*/

simd_utils_altivec_float.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Project : SIMD_Utils
3-
* Version : 0.2.0
3+
* Version : 0.2.1
44
* Author : JishinMaster
55
* Licence : BSD-2
66
*/

simd_utils_avx512_double.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Project : SIMD_Utils
3-
* Version : 0.2.0
3+
* Version : 0.2.1
44
* Author : JishinMaster
55
* Licence : BSD-2
66
*/

0 commit comments

Comments
 (0)