@@ -30,24 +30,13 @@ template <typename vtype, typename reg_t = typename vtype::reg_t>
3030X86_SIMD_SORT_INLINE reg_t sort_ymm_32bit_half (reg_t ymm)
3131{
3232 using swizzle = typename vtype::swizzle_ops;
33-
34- const typename vtype::opmask_t oxAA
35- = vtype::seti (-1 , 0 , -1 , 0 );
36- const typename vtype::opmask_t oxCC
37- = vtype::seti (-1 , -1 , 0 , 0 );
38-
39- ymm = cmp_merge<vtype>(
40- ymm,
41- swizzle::template swap_n<vtype, 2 >(ymm),
42- oxAA);
43- ymm = cmp_merge<vtype>(
44- ymm,
45- vtype::reverse (ymm),
46- oxCC);
47- ymm = cmp_merge<vtype>(
48- ymm,
49- swizzle::template swap_n<vtype, 2 >(ymm),
50- oxAA);
33+
34+ const typename vtype::opmask_t oxAA = vtype::seti (-1 , 0 , -1 , 0 );
35+ const typename vtype::opmask_t oxCC = vtype::seti (-1 , -1 , 0 , 0 );
36+
37+ ymm = cmp_merge<vtype>(ymm, swizzle::template swap_n<vtype, 2 >(ymm), oxAA);
38+ ymm = cmp_merge<vtype>(ymm, vtype::reverse (ymm), oxCC);
39+ ymm = cmp_merge<vtype>(ymm, swizzle::template swap_n<vtype, 2 >(ymm), oxAA);
5140 return ymm;
5241}
5342
@@ -61,7 +50,7 @@ struct avx2_half_vector<int32_t> {
6150 using opmask_t = __m128i;
6251 static const uint8_t numlanes = 4 ;
6352 static constexpr simd_type vec_type = simd_type::AVX2;
64-
53+
6554 using swizzle_ops = avx2_32bit_half_swizzle_ops;
6655
6756 static type_t type_max ()
@@ -81,13 +70,11 @@ struct avx2_half_vector<int32_t> {
8170 auto mask = ((0x1ull << num_to_read) - 0x1ull );
8271 return convert_int_to_avx2_mask_half (mask);
8372 }
84- static ymmi_t
85- seti (int v1, int v2, int v3, int v4)
73+ static ymmi_t seti (int v1, int v2, int v3, int v4)
8674 {
8775 return _mm_set_epi32 (v1, v2, v3, v4);
8876 }
89- static reg_t
90- set (int v1, int v2, int v3, int v4)
77+ static reg_t set (int v1, int v2, int v3, int v4)
9178 {
9279 return _mm_set_epi32 (v1, v2, v3, v4);
9380 }
@@ -99,8 +86,8 @@ struct avx2_half_vector<int32_t> {
9986 {
10087 opmask_t equal = eq (x, y);
10188 opmask_t greater = _mm_cmpgt_epi32 (x, y);
102- return _mm_castps_si128 (_mm_or_ps ( _mm_castsi128_ps (equal),
103- _mm_castsi128_ps (greater)));
89+ return _mm_castps_si128 (
90+ _mm_or_ps ( _mm_castsi128_ps (equal), _mm_castsi128_ps (greater)));
10491 }
10592 static opmask_t eq (reg_t x, reg_t y)
10693 {
@@ -110,14 +97,12 @@ struct avx2_half_vector<int32_t> {
11097 static reg_t
11198 mask_i64gather (reg_t src, opmask_t mask, __m256i index, void const *base)
11299 {
113- return _mm256_mask_i64gather_epi32 (src, (const int *) base, index, mask, scale);
100+ return _mm256_mask_i64gather_epi32 (
101+ src, (const int *)base, index, mask, scale);
114102 }
115103 static reg_t i64gather (type_t *arr, arrsize_t *ind)
116104 {
117- return set (arr[ind[3 ]],
118- arr[ind[2 ]],
119- arr[ind[1 ]],
120- arr[ind[0 ]]);
105+ return set (arr[ind[3 ]], arr[ind[2 ]], arr[ind[1 ]], arr[ind[0 ]]);
121106 }
122107 static reg_t loadu (void const *mem)
123108 {
@@ -143,8 +128,8 @@ struct avx2_half_vector<int32_t> {
143128 static reg_t mask_mov (reg_t x, opmask_t mask, reg_t y)
144129 {
145130 return _mm_castps_si128 (_mm_blendv_ps (_mm_castsi128_ps (x),
146- _mm_castsi128_ps (y),
147- _mm_castsi128_ps (mask)));
131+ _mm_castsi128_ps (y),
132+ _mm_castsi128_ps (mask)));
148133 }
149134 static void mask_storeu (void *mem, opmask_t mask, reg_t x)
150135 {
@@ -217,7 +202,7 @@ struct avx2_half_vector<uint32_t> {
217202 using opmask_t = __m128i;
218203 static const uint8_t numlanes = 4 ;
219204 static constexpr simd_type vec_type = simd_type::AVX2;
220-
205+
221206 using swizzle_ops = avx2_32bit_half_swizzle_ops;
222207
223208 static type_t type_max ()
@@ -237,28 +222,24 @@ struct avx2_half_vector<uint32_t> {
237222 auto mask = ((0x1ull << num_to_read) - 0x1ull );
238223 return convert_int_to_avx2_mask_half (mask);
239224 }
240- static ymmi_t
241- seti (int v1, int v2, int v3, int v4)
225+ static ymmi_t seti (int v1, int v2, int v3, int v4)
242226 {
243227 return _mm_set_epi32 (v1, v2, v3, v4);
244228 }
245- static reg_t
246- set (int v1, int v2, int v3, int v4)
229+ static reg_t set (int v1, int v2, int v3, int v4)
247230 {
248231 return _mm_set_epi32 (v1, v2, v3, v4);
249232 }
250233 template <int scale>
251234 static reg_t
252235 mask_i64gather (reg_t src, opmask_t mask, __m256i index, void const *base)
253236 {
254- return _mm256_mask_i64gather_epi32 (src, (const int *) base, index, mask, scale);
237+ return _mm256_mask_i64gather_epi32 (
238+ src, (const int *)base, index, mask, scale);
255239 }
256240 static reg_t i64gather (type_t *arr, arrsize_t *ind)
257241 {
258- return set (arr[ind[3 ]],
259- arr[ind[2 ]],
260- arr[ind[1 ]],
261- arr[ind[0 ]]);
242+ return set (arr[ind[3 ]], arr[ind[2 ]], arr[ind[1 ]], arr[ind[0 ]]);
262243 }
263244 static opmask_t ge (reg_t x, reg_t y)
264245 {
@@ -289,8 +270,8 @@ struct avx2_half_vector<uint32_t> {
289270 static reg_t mask_mov (reg_t x, opmask_t mask, reg_t y)
290271 {
291272 return _mm_castps_si128 (_mm_blendv_ps (_mm_castsi128_ps (x),
292- _mm_castsi128_ps (y),
293- _mm_castsi128_ps (mask)));
273+ _mm_castsi128_ps (y),
274+ _mm_castsi128_ps (mask)));
294275 }
295276 static void mask_storeu (void *mem, opmask_t mask, reg_t x)
296277 {
@@ -363,7 +344,7 @@ struct avx2_half_vector<float> {
363344 using opmask_t = __m128i;
364345 static const uint8_t numlanes = 4 ;
365346 static constexpr simd_type vec_type = simd_type::AVX2;
366-
347+
367348 using swizzle_ops = avx2_32bit_half_swizzle_ops;
368349
369350 static type_t type_max ()
@@ -379,13 +360,11 @@ struct avx2_half_vector<float> {
379360 return _mm_set1_ps (type_max ());
380361 }
381362
382- static ymmi_t
383- seti (int v1, int v2, int v3, int v4)
363+ static ymmi_t seti (int v1, int v2, int v3, int v4)
384364 {
385365 return _mm_set_epi32 (v1, v2, v3, v4);
386366 }
387- static reg_t
388- set (float v1, float v2, float v3, float v4)
367+ static reg_t set (float v1, float v2, float v3, float v4)
389368 {
390369 return _mm_set_ps (v1, v2, v3, v4);
391370 }
@@ -424,14 +403,12 @@ struct avx2_half_vector<float> {
424403 static reg_t
425404 mask_i64gather (reg_t src, opmask_t mask, __m256i index, void const *base)
426405 {
427- return _mm256_mask_i64gather_ps (src, (const float *) base, index, _mm_castsi128_ps (mask), scale);
406+ return _mm256_mask_i64gather_ps (
407+ src, (const float *)base, index, _mm_castsi128_ps (mask), scale);
428408 }
429409 static reg_t i64gather (type_t *arr, arrsize_t *ind)
430410 {
431- return set (arr[ind[3 ]],
432- arr[ind[2 ]],
433- arr[ind[1 ]],
434- arr[ind[0 ]]);
411+ return set (arr[ind[3 ]], arr[ind[2 ]], arr[ind[1 ]], arr[ind[0 ]]);
435412 }
436413 static reg_t loadu (void const *mem)
437414 {
@@ -490,8 +467,7 @@ struct avx2_half_vector<float> {
490467 template <uint8_t mask>
491468 static reg_t shuffle (reg_t ymm)
492469 {
493- return _mm_castsi128_ps (
494- _mm_shuffle_epi32 (_mm_castps_si128 (ymm), mask));
470+ return _mm_castsi128_ps (_mm_shuffle_epi32 (_mm_castps_si128 (ymm), mask));
495471 }
496472 static void storeu (void *mem, reg_t x)
497473 {
@@ -566,9 +542,7 @@ struct avx2_32bit_half_swizzle_ops {
566542 __m128i v1 = vtype::cast_to (reg);
567543 __m128i v2 = vtype::cast_to (other);
568544
569- if constexpr (scale == 2 ) {
570- v1 = _mm_blend_epi32 (v1, v2, 0b0101 );
571- }
545+ if constexpr (scale == 2 ) { v1 = _mm_blend_epi32 (v1, v2, 0b0101 ); }
572546 else if constexpr (scale == 4 ) {
573547 v1 = _mm_blend_epi32 (v1, v2, 0b0011 );
574548 }
0 commit comments