numpy
diff --git a/src/avx2-32bit-half.hpp b/src/avx2-32bit-half.hpp
Lines changed: 33 additions & 59 deletions
diff --git a/src/avx2-64bit-qsort.hpp b/src/avx2-64bit-qsort.hpp
Lines changed: 15 additions & 28 deletions
diff --git a/src/avx2-emu-funcs.hpp b/src/avx2-emu-funcs.hpp
Lines changed: 17 additions & 11 deletions
@@ -30,24 +30,13 @@ template <typename vtype, typename reg_t = typename vtype::reg_t>
 X86_SIMD_SORT_INLINE reg_t sort_ymm_32bit_half(reg_t ymm)
 {
     using swizzle = typename vtype::swizzle_ops;
-    
-    const typename vtype::opmask_t oxAA
-            = vtype::seti(-1, 0, -1, 0);
-    const typename vtype::opmask_t oxCC
-            = vtype::seti(-1, -1, 0, 0);
-            
-    ymm = cmp_merge<vtype>(
-            ymm,
-            swizzle::template swap_n<vtype, 2>(ymm),
-            oxAA);
-    ymm = cmp_merge<vtype>(
-            ymm,
-            vtype::reverse(ymm),
-            oxCC);
-    ymm = cmp_merge<vtype>(
-            ymm,
-            swizzle::template swap_n<vtype, 2>(ymm),
-            oxAA);
+
+    const typename vtype::opmask_t oxAA = vtype::seti(-1, 0, -1, 0);
+    const typename vtype::opmask_t oxCC = vtype::seti(-1, -1, 0, 0);
+
+    ymm = cmp_merge<vtype>(ymm, swizzle::template swap_n<vtype, 2>(ymm), oxAA);
+    ymm = cmp_merge<vtype>(ymm, vtype::reverse(ymm), oxCC);
+    ymm = cmp_merge<vtype>(ymm, swizzle::template swap_n<vtype, 2>(ymm), oxAA);
     return ymm;
 }
 
@@ -61,7 +50,7 @@ struct avx2_half_vector<int32_t> {
     using opmask_t = __m128i;
     static const uint8_t numlanes = 4;
     static constexpr simd_type vec_type = simd_type::AVX2;
-    
+
     using swizzle_ops = avx2_32bit_half_swizzle_ops;
 
     static type_t type_max()
@@ -81,13 +70,11 @@ struct avx2_half_vector<int32_t> {
         auto mask = ((0x1ull << num_to_read) - 0x1ull);
         return convert_int_to_avx2_mask_half(mask);
     }
-    static ymmi_t
-    seti(int v1, int v2, int v3, int v4)
+    static ymmi_t seti(int v1, int v2, int v3, int v4)
     {
         return _mm_set_epi32(v1, v2, v3, v4);
     }
-    static reg_t
-    set(int v1, int v2, int v3, int v4)
+    static reg_t set(int v1, int v2, int v3, int v4)
     {
         return _mm_set_epi32(v1, v2, v3, v4);
     }
@@ -99,8 +86,8 @@ struct avx2_half_vector<int32_t> {
     {
         opmask_t equal = eq(x, y);
         opmask_t greater = _mm_cmpgt_epi32(x, y);
-        return _mm_castps_si128(_mm_or_ps(_mm_castsi128_ps(equal),
-                                                _mm_castsi128_ps(greater)));
+        return _mm_castps_si128(
+                _mm_or_ps(_mm_castsi128_ps(equal), _mm_castsi128_ps(greater)));
     }
     static opmask_t eq(reg_t x, reg_t y)
     {
@@ -110,14 +97,12 @@ struct avx2_half_vector<int32_t> {
     static reg_t
     mask_i64gather(reg_t src, opmask_t mask, __m256i index, void const *base)
     {
-        return _mm256_mask_i64gather_epi32(src, (const int *) base, index, mask, scale);
+        return _mm256_mask_i64gather_epi32(
+                src, (const int *)base, index, mask, scale);
     }
     static reg_t i64gather(type_t *arr, arrsize_t *ind)
     {
-        return set(arr[ind[3]],
-                   arr[ind[2]],
-                   arr[ind[1]],
-                   arr[ind[0]]);
+        return set(arr[ind[3]], arr[ind[2]], arr[ind[1]], arr[ind[0]]);
     }
     static reg_t loadu(void const *mem)
     {
@@ -143,8 +128,8 @@ struct avx2_half_vector<int32_t> {
     static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y)
     {
         return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(x),
-                                                    _mm_castsi128_ps(y),
-                                                    _mm_castsi128_ps(mask)));
+                                              _mm_castsi128_ps(y),
+                                              _mm_castsi128_ps(mask)));
     }
     static void mask_storeu(void *mem, opmask_t mask, reg_t x)
     {
@@ -217,7 +202,7 @@ struct avx2_half_vector<uint32_t> {
     using opmask_t = __m128i;
     static const uint8_t numlanes = 4;
     static constexpr simd_type vec_type = simd_type::AVX2;
-    
+
     using swizzle_ops = avx2_32bit_half_swizzle_ops;
 
     static type_t type_max()
@@ -237,28 +222,24 @@ struct avx2_half_vector<uint32_t> {
         auto mask = ((0x1ull << num_to_read) - 0x1ull);
         return convert_int_to_avx2_mask_half(mask);
     }
-    static ymmi_t
-    seti(int v1, int v2, int v3, int v4)
+    static ymmi_t seti(int v1, int v2, int v3, int v4)
     {
         return _mm_set_epi32(v1, v2, v3, v4);
     }
-    static reg_t
-    set(int v1, int v2, int v3, int v4)
+    static reg_t set(int v1, int v2, int v3, int v4)
     {
         return _mm_set_epi32(v1, v2, v3, v4);
     }
     template <int scale>
     static reg_t
     mask_i64gather(reg_t src, opmask_t mask, __m256i index, void const *base)
     {
-        return _mm256_mask_i64gather_epi32(src, (const int *) base, index, mask, scale);
+        return _mm256_mask_i64gather_epi32(
+                src, (const int *)base, index, mask, scale);
     }
     static reg_t i64gather(type_t *arr, arrsize_t *ind)
     {
-        return set(arr[ind[3]],
-                   arr[ind[2]],
-                   arr[ind[1]],
-                   arr[ind[0]]);
+        return set(arr[ind[3]], arr[ind[2]], arr[ind[1]], arr[ind[0]]);
     }
     static opmask_t ge(reg_t x, reg_t y)
     {
@@ -289,8 +270,8 @@ struct avx2_half_vector<uint32_t> {
     static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y)
     {
         return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(x),
-                                                    _mm_castsi128_ps(y),
-                                                    _mm_castsi128_ps(mask)));
+                                              _mm_castsi128_ps(y),
+                                              _mm_castsi128_ps(mask)));
     }
     static void mask_storeu(void *mem, opmask_t mask, reg_t x)
     {
@@ -363,7 +344,7 @@ struct avx2_half_vector<float> {
     using opmask_t = __m128i;
     static const uint8_t numlanes = 4;
     static constexpr simd_type vec_type = simd_type::AVX2;
-    
+
     using swizzle_ops = avx2_32bit_half_swizzle_ops;
 
     static type_t type_max()
@@ -379,13 +360,11 @@ struct avx2_half_vector<float> {
         return _mm_set1_ps(type_max());
     }
 
-    static ymmi_t
-    seti(int v1, int v2, int v3, int v4)
+    static ymmi_t seti(int v1, int v2, int v3, int v4)
     {
         return _mm_set_epi32(v1, v2, v3, v4);
     }
-    static reg_t
-    set(float v1, float v2, float v3, float v4)
+    static reg_t set(float v1, float v2, float v3, float v4)
     {
         return _mm_set_ps(v1, v2, v3, v4);
     }
@@ -424,14 +403,12 @@ struct avx2_half_vector<float> {
     static reg_t
     mask_i64gather(reg_t src, opmask_t mask, __m256i index, void const *base)
     {
-        return _mm256_mask_i64gather_ps(src, (const float*) base, index, _mm_castsi128_ps(mask), scale);
+        return _mm256_mask_i64gather_ps(
+                src, (const float *)base, index, _mm_castsi128_ps(mask), scale);
     }
     static reg_t i64gather(type_t *arr, arrsize_t *ind)
     {
-        return set(arr[ind[3]],
-                   arr[ind[2]],
-                   arr[ind[1]],
-                   arr[ind[0]]);
+        return set(arr[ind[3]], arr[ind[2]], arr[ind[1]], arr[ind[0]]);
     }
     static reg_t loadu(void const *mem)
     {
@@ -490,8 +467,7 @@ struct avx2_half_vector<float> {
     template <uint8_t mask>
     static reg_t shuffle(reg_t ymm)
     {
-        return _mm_castsi128_ps(
-                _mm_shuffle_epi32(_mm_castps_si128(ymm), mask));
+        return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(ymm), mask));
     }
     static void storeu(void *mem, reg_t x)
     {
@@ -566,9 +542,7 @@ struct avx2_32bit_half_swizzle_ops {
         __m128i v1 = vtype::cast_to(reg);
         __m128i v2 = vtype::cast_to(other);
 
-        if constexpr (scale == 2) {
-            v1 = _mm_blend_epi32(v1, v2, 0b0101);
-        }
+        if constexpr (scale == 2) { v1 = _mm_blend_epi32(v1, v2, 0b0101); }
         else if constexpr (scale == 4) {
             v1 = _mm_blend_epi32(v1, v2, 0b0011);
         }
 
@@ -77,10 +77,7 @@ struct avx2_vector<int64_t> {
     {
         return _mm256_set_epi64x(v1, v2, v3, v4);
     }
-    static reg_t set(type_t v1,
-                     type_t v2,
-                     type_t v3,
-                     type_t v4)
+    static reg_t set(type_t v1, type_t v2, type_t v3, type_t v4)
     {
         return _mm256_set_epi64x(v1, v2, v3, v4);
     }
@@ -106,14 +103,12 @@ struct avx2_vector<int64_t> {
     static reg_t
     mask_i64gather(reg_t src, opmask_t mask, __m256i index, void const *base)
     {
-        return _mm256_mask_i64gather_epi64(src, (const long long int *) base, index, mask, scale);
+        return _mm256_mask_i64gather_epi64(
+                src, (const long long int *)base, index, mask, scale);
     }
     static reg_t i64gather(type_t *arr, arrsize_t *ind)
     {
-        return set(arr[ind[3]],
-                   arr[ind[2]],
-                   arr[ind[1]],
-                   arr[ind[0]]);
+        return set(arr[ind[3]], arr[ind[2]], arr[ind[1]], arr[ind[0]]);
     }
     static reg_t loadu(void const *mem)
     {
@@ -246,25 +241,20 @@ struct avx2_vector<uint64_t> {
     {
         return _mm256_set_epi64x(v1, v2, v3, v4);
     }
-    static reg_t set(type_t v1,
-                     type_t v2,
-                     type_t v3,
-                     type_t v4)
+    static reg_t set(type_t v1, type_t v2, type_t v3, type_t v4)
     {
         return _mm256_set_epi64x(v1, v2, v3, v4);
     }
     template <int scale>
     static reg_t
     mask_i64gather(reg_t src, opmask_t mask, __m256i index, void const *base)
     {
-        return _mm256_mask_i64gather_epi64(src, (const long long int *) base, index, mask, scale);
+        return _mm256_mask_i64gather_epi64(
+                src, (const long long int *)base, index, mask, scale);
     }
     static reg_t i64gather(type_t *arr, arrsize_t *ind)
     {
-        return set(arr[ind[3]],
-                   arr[ind[2]],
-                   arr[ind[1]],
-                   arr[ind[0]]);
+        return set(arr[ind[3]], arr[ind[2]], arr[ind[1]], arr[ind[0]]);
     }
     static opmask_t gt(reg_t x, reg_t y)
     {
@@ -427,10 +417,7 @@ struct avx2_vector<double> {
     {
         return _mm256_set_epi64x(v1, v2, v3, v4);
     }
-    static reg_t set(type_t v1,
-                     type_t v2,
-                     type_t v3,
-                     type_t v4)
+    static reg_t set(type_t v1, type_t v2, type_t v3, type_t v4)
     {
         return _mm256_set_pd(v1, v2, v3, v4);
     }
@@ -450,16 +437,16 @@ struct avx2_vector<double> {
     static reg_t
     mask_i64gather(reg_t src, opmask_t mask, __m256i index, void const *base)
     {
-        return _mm256_mask_i64gather_pd(
-                src, (const type_t *) base, index, _mm256_castsi256_pd(mask), scale);
+        return _mm256_mask_i64gather_pd(src,
+                                        (const type_t *)base,
+                                        index,
+                                        _mm256_castsi256_pd(mask),
+                                        scale);
         ;
     }
     static reg_t i64gather(type_t *arr, arrsize_t *ind)
     {
-        return set(arr[ind[3]],
-                   arr[ind[2]],
-                   arr[ind[1]],
-                   arr[ind[0]]);
+        return set(arr[ind[3]], arr[ind[2]], arr[ind[1]], arr[ind[0]]);
     }
     static reg_t loadu(void const *mem)
     {
 
@@ -107,8 +107,10 @@ constexpr auto avx2_compressstore_lut32_half_gen = [] {
     return lutPair;
 }();
 
-constexpr auto avx2_compressstore_lut32_half_perm = avx2_compressstore_lut32_half_gen[0];
-constexpr auto avx2_compressstore_lut32_half_left = avx2_compressstore_lut32_half_gen[1];
+constexpr auto avx2_compressstore_lut32_half_perm
+        = avx2_compressstore_lut32_half_gen[0];
+constexpr auto avx2_compressstore_lut32_half_left
+        = avx2_compressstore_lut32_half_gen[1];
 
 constexpr auto avx2_compressstore_lut64_gen = [] {
     std::array<std::array<int32_t, 8>, 16> permLut {};
@@ -281,19 +283,22 @@ void avx2_emu_mask_compressstoreu32(void *base_addr,
 }
 
 template <typename T>
-void avx2_emu_mask_compressstoreu32_half(void *base_addr,
-                                    typename avx2_half_vector<T>::opmask_t k,
-                                    typename avx2_half_vector<T>::reg_t reg)
+void avx2_emu_mask_compressstoreu32_half(
+        void *base_addr,
+        typename avx2_half_vector<T>::opmask_t k,
+        typename avx2_half_vector<T>::reg_t reg)
 {
     using vtype = avx2_half_vector<T>;
 
     T *leftStore = (T *)base_addr;
 
     int32_t shortMask = convert_avx2_mask_to_int_half(k);
     const __m128i &perm = _mm_loadu_si128(
-            (const __m128i *)avx2_compressstore_lut32_half_perm[shortMask].data());
+            (const __m128i *)avx2_compressstore_lut32_half_perm[shortMask]
+                    .data());
     const __m128i &left = _mm_loadu_si128(
-            (const __m128i *)avx2_compressstore_lut32_half_left[shortMask].data());
+            (const __m128i *)avx2_compressstore_lut32_half_left[shortMask]
+                    .data());
 
     typename vtype::reg_t temp = vtype::permutevar(reg, perm);
 
@@ -346,9 +351,9 @@ int avx2_double_compressstore32(void *left_addr,
 
 template <typename T>
 int avx2_double_compressstore32_half(void *left_addr,
-                                void *right_addr,
-                                typename avx2_half_vector<T>::opmask_t k,
-                                typename avx2_half_vector<T>::reg_t reg)
+                                     void *right_addr,
+                                     typename avx2_half_vector<T>::opmask_t k,
+                                     typename avx2_half_vector<T>::reg_t reg)
 {
     using vtype = avx2_half_vector<T>;
 
@@ -357,7 +362,8 @@ int avx2_double_compressstore32_half(void *left_addr,
 
     int32_t shortMask = convert_avx2_mask_to_int_half(k);
     const __m128i &perm = _mm_loadu_si128(
-            (const __m128i *)avx2_compressstore_lut32_half_perm[shortMask].data());
+            (const __m128i *)avx2_compressstore_lut32_half_perm[shortMask]
+                    .data());
 
     typename vtype::reg_t temp = vtype::permutevar(reg, perm);
Original file line number	Diff line number	Diff line change
`@@ -30,24 +30,13 @@ template <typename vtype, typename reg_t = typename vtype::reg_t>`
`30`	`30`	`X86_SIMD_SORT_INLINE reg_t sort_ymm_32bit_half(reg_t ymm)`
`31`	`31`	`{`
`32`	`32`	`using swizzle = typename vtype::swizzle_ops;`
`33`		`-`
`34`		`- const typename vtype::opmask_t oxAA`
`35`		`- = vtype::seti(-1, 0, -1, 0);`
`36`		`- const typename vtype::opmask_t oxCC`
`37`		`- = vtype::seti(-1, -1, 0, 0);`
`38`		`-`
`39`		`- ymm = cmp_merge<vtype>(`
`40`		`- ymm,`
`41`		`- swizzle::template swap_n<vtype, 2>(ymm),`
`42`		`- oxAA);`
`43`		`- ymm = cmp_merge<vtype>(`
`44`		`- ymm,`
`45`		`- vtype::reverse(ymm),`
`46`		`- oxCC);`
`47`		`- ymm = cmp_merge<vtype>(`
`48`		`- ymm,`
`49`		`- swizzle::template swap_n<vtype, 2>(ymm),`
`50`		`- oxAA);`
	`33`	`+`
	`34`	`+ const typename vtype::opmask_t oxAA = vtype::seti(-1, 0, -1, 0);`
	`35`	`+ const typename vtype::opmask_t oxCC = vtype::seti(-1, -1, 0, 0);`
	`36`	`+`
	`37`	`+ ymm = cmp_merge<vtype>(ymm, swizzle::template swap_n<vtype, 2>(ymm), oxAA);`
	`38`	`+ ymm = cmp_merge<vtype>(ymm, vtype::reverse(ymm), oxCC);`
	`39`	`+ ymm = cmp_merge<vtype>(ymm, swizzle::template swap_n<vtype, 2>(ymm), oxAA);`
`51`	`40`	`return ymm;`
`52`	`41`	`}`
`53`	`42`
`@@ -61,7 +50,7 @@ struct avx2_half_vector<int32_t> {`
`61`	`50`	`using opmask_t = __m128i;`
`62`	`51`	`static const uint8_t numlanes = 4;`
`63`	`52`	`static constexpr simd_type vec_type = simd_type::AVX2;`
`64`		`-`
	`53`	`+`
`65`	`54`	`using swizzle_ops = avx2_32bit_half_swizzle_ops;`
`66`	`55`
`67`	`56`	`static type_t type_max()`
`@@ -81,13 +70,11 @@ struct avx2_half_vector<int32_t> {`
`81`	`70`	`auto mask = ((0x1ull << num_to_read) - 0x1ull);`
`82`	`71`	`return convert_int_to_avx2_mask_half(mask);`
`83`	`72`	`}`
`84`		`- static ymmi_t`
`85`		`- seti(int v1, int v2, int v3, int v4)`
	`73`	`+ static ymmi_t seti(int v1, int v2, int v3, int v4)`
`86`	`74`	`{`
`87`	`75`	`return _mm_set_epi32(v1, v2, v3, v4);`
`88`	`76`	`}`
`89`		`- static reg_t`
`90`		`- set(int v1, int v2, int v3, int v4)`
	`77`	`+ static reg_t set(int v1, int v2, int v3, int v4)`
`91`	`78`	`{`
`92`	`79`	`return _mm_set_epi32(v1, v2, v3, v4);`
`93`	`80`	`}`
`@@ -99,8 +86,8 @@ struct avx2_half_vector<int32_t> {`
`99`	`86`	`{`
`100`	`87`	`opmask_t equal = eq(x, y);`
`101`	`88`	`opmask_t greater = _mm_cmpgt_epi32(x, y);`
`102`		`- return _mm_castps_si128(_mm_or_ps(_mm_castsi128_ps(equal),`
`103`		`- _mm_castsi128_ps(greater)));`
	`89`	`+ return _mm_castps_si128(`
	`90`	`+ _mm_or_ps(_mm_castsi128_ps(equal), _mm_castsi128_ps(greater)));`
`104`	`91`	`}`
`105`	`92`	`static opmask_t eq(reg_t x, reg_t y)`
`106`	`93`	`{`
`@@ -110,14 +97,12 @@ struct avx2_half_vector<int32_t> {`
`110`	`97`	`static reg_t`
`111`	`98`	`mask_i64gather(reg_t src, opmask_t mask, __m256i index, void const *base)`
`112`	`99`	`{`
`113`		`- return _mm256_mask_i64gather_epi32(src, (const int *) base, index, mask, scale);`
	`100`	`+ return _mm256_mask_i64gather_epi32(`
	`101`	`+ src, (const int *)base, index, mask, scale);`
`114`	`102`	`}`
`115`	`103`	`static reg_t i64gather(type_t *arr, arrsize_t *ind)`
`116`	`104`	`{`
`117`		`- return set(arr[ind[3]],`
`118`		`- arr[ind[2]],`
`119`		`- arr[ind[1]],`
`120`		`- arr[ind[0]]);`
	`105`	`+ return set(arr[ind[3]], arr[ind[2]], arr[ind[1]], arr[ind[0]]);`
`121`	`106`	`}`
`122`	`107`	`static reg_t loadu(void const *mem)`
`123`	`108`	`{`
`@@ -143,8 +128,8 @@ struct avx2_half_vector<int32_t> {`
`143`	`128`	`static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y)`
`144`	`129`	`{`
`145`	`130`	`return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(x),`
`146`		`- _mm_castsi128_ps(y),`
`147`		`- _mm_castsi128_ps(mask)));`
	`131`	`+ _mm_castsi128_ps(y),`
	`132`	`+ _mm_castsi128_ps(mask)));`
`148`	`133`	`}`
`149`	`134`	`static void mask_storeu(void *mem, opmask_t mask, reg_t x)`
`150`	`135`	`{`
`@@ -217,7 +202,7 @@ struct avx2_half_vector<uint32_t> {`
`217`	`202`	`using opmask_t = __m128i;`
`218`	`203`	`static const uint8_t numlanes = 4;`
`219`	`204`	`static constexpr simd_type vec_type = simd_type::AVX2;`
`220`		`-`
	`205`	`+`
`221`	`206`	`using swizzle_ops = avx2_32bit_half_swizzle_ops;`
`222`	`207`
`223`	`208`	`static type_t type_max()`
`@@ -237,28 +222,24 @@ struct avx2_half_vector<uint32_t> {`
`237`	`222`	`auto mask = ((0x1ull << num_to_read) - 0x1ull);`
`238`	`223`	`return convert_int_to_avx2_mask_half(mask);`
`239`	`224`	`}`
`240`		`- static ymmi_t`
`241`		`- seti(int v1, int v2, int v3, int v4)`
	`225`	`+ static ymmi_t seti(int v1, int v2, int v3, int v4)`
`242`	`226`	`{`
`243`	`227`	`return _mm_set_epi32(v1, v2, v3, v4);`
`244`	`228`	`}`
`245`		`- static reg_t`
`246`		`- set(int v1, int v2, int v3, int v4)`
	`229`	`+ static reg_t set(int v1, int v2, int v3, int v4)`
`247`	`230`	`{`
`248`	`231`	`return _mm_set_epi32(v1, v2, v3, v4);`
`249`	`232`	`}`
`250`	`233`	`template <int scale>`
`251`	`234`	`static reg_t`
`252`	`235`	`mask_i64gather(reg_t src, opmask_t mask, __m256i index, void const *base)`
`253`	`236`	`{`
`254`		`- return _mm256_mask_i64gather_epi32(src, (const int *) base, index, mask, scale);`
	`237`	`+ return _mm256_mask_i64gather_epi32(`
	`238`	`+ src, (const int *)base, index, mask, scale);`
`255`	`239`	`}`
`256`	`240`	`static reg_t i64gather(type_t *arr, arrsize_t *ind)`
`257`	`241`	`{`
`258`		`- return set(arr[ind[3]],`
`259`		`- arr[ind[2]],`
`260`		`- arr[ind[1]],`
`261`		`- arr[ind[0]]);`
	`242`	`+ return set(arr[ind[3]], arr[ind[2]], arr[ind[1]], arr[ind[0]]);`
`262`	`243`	`}`
`263`	`244`	`static opmask_t ge(reg_t x, reg_t y)`
`264`	`245`	`{`
`@@ -289,8 +270,8 @@ struct avx2_half_vector<uint32_t> {`
`289`	`270`	`static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y)`
`290`	`271`	`{`
`291`	`272`	`return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(x),`
`292`		`- _mm_castsi128_ps(y),`
`293`		`- _mm_castsi128_ps(mask)));`
	`273`	`+ _mm_castsi128_ps(y),`
	`274`	`+ _mm_castsi128_ps(mask)));`
`294`	`275`	`}`
`295`	`276`	`static void mask_storeu(void *mem, opmask_t mask, reg_t x)`
`296`	`277`	`{`
`@@ -363,7 +344,7 @@ struct avx2_half_vector<float> {`
`363`	`344`	`using opmask_t = __m128i;`
`364`	`345`	`static const uint8_t numlanes = 4;`
`365`	`346`	`static constexpr simd_type vec_type = simd_type::AVX2;`
`366`		`-`
	`347`	`+`
`367`	`348`	`using swizzle_ops = avx2_32bit_half_swizzle_ops;`
`368`	`349`
`369`	`350`	`static type_t type_max()`
`@@ -379,13 +360,11 @@ struct avx2_half_vector<float> {`
`379`	`360`	`return _mm_set1_ps(type_max());`
`380`	`361`	`}`
`381`	`362`
`382`		`- static ymmi_t`
`383`		`- seti(int v1, int v2, int v3, int v4)`
	`363`	`+ static ymmi_t seti(int v1, int v2, int v3, int v4)`
`384`	`364`	`{`
`385`	`365`	`return _mm_set_epi32(v1, v2, v3, v4);`
`386`	`366`	`}`
`387`		`- static reg_t`
`388`		`- set(float v1, float v2, float v3, float v4)`
	`367`	`+ static reg_t set(float v1, float v2, float v3, float v4)`
`389`	`368`	`{`
`390`	`369`	`return _mm_set_ps(v1, v2, v3, v4);`
`391`	`370`	`}`
`@@ -424,14 +403,12 @@ struct avx2_half_vector<float> {`
`424`	`403`	`static reg_t`
`425`	`404`	`mask_i64gather(reg_t src, opmask_t mask, __m256i index, void const *base)`
`426`	`405`	`{`
`427`		`- return _mm256_mask_i64gather_ps(src, (const float*) base, index, _mm_castsi128_ps(mask), scale);`
	`406`	`+ return _mm256_mask_i64gather_ps(`
	`407`	`+ src, (const float *)base, index, _mm_castsi128_ps(mask), scale);`
`428`	`408`	`}`
`429`	`409`	`static reg_t i64gather(type_t *arr, arrsize_t *ind)`
`430`	`410`	`{`
`431`		`- return set(arr[ind[3]],`
`432`		`- arr[ind[2]],`
`433`		`- arr[ind[1]],`
`434`		`- arr[ind[0]]);`
	`411`	`+ return set(arr[ind[3]], arr[ind[2]], arr[ind[1]], arr[ind[0]]);`
`435`	`412`	`}`
`436`	`413`	`static reg_t loadu(void const *mem)`
`437`	`414`	`{`
`@@ -490,8 +467,7 @@ struct avx2_half_vector<float> {`
`490`	`467`	`template <uint8_t mask>`
`491`	`468`	`static reg_t shuffle(reg_t ymm)`
`492`	`469`	`{`
`493`		`- return _mm_castsi128_ps(`
`494`		`- _mm_shuffle_epi32(_mm_castps_si128(ymm), mask));`
	`470`	`+ return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(ymm), mask));`
`495`	`471`	`}`
`496`	`472`	`static void storeu(void *mem, reg_t x)`
`497`	`473`	`{`
`@@ -566,9 +542,7 @@ struct avx2_32bit_half_swizzle_ops {`
`566`	`542`	`__m128i v1 = vtype::cast_to(reg);`
`567`	`543`	`__m128i v2 = vtype::cast_to(other);`
`568`	`544`
`569`		`- if constexpr (scale == 2) {`
`570`		`- v1 = _mm_blend_epi32(v1, v2, 0b0101);`
`571`		`- }`
	`545`	`+ if constexpr (scale == 2) { v1 = _mm_blend_epi32(v1, v2, 0b0101); }`
`572`	`546`	`else if constexpr (scale == 4) {`
`573`	`547`	`v1 = _mm_blend_epi32(v1, v2, 0b0011);`
`574`	`548`	`}`
Original file line number	Diff line number	Diff line change
`@@ -77,10 +77,7 @@ struct avx2_vector<int64_t> {`
`77`	`77`	`{`
`78`	`78`	`return _mm256_set_epi64x(v1, v2, v3, v4);`
`79`	`79`	`}`
`80`		`- static reg_t set(type_t v1,`
`81`		`- type_t v2,`
`82`		`- type_t v3,`
`83`		`- type_t v4)`
	`80`	`+ static reg_t set(type_t v1, type_t v2, type_t v3, type_t v4)`
`84`	`81`	`{`
`85`	`82`	`return _mm256_set_epi64x(v1, v2, v3, v4);`
`86`	`83`	`}`
`@@ -106,14 +103,12 @@ struct avx2_vector<int64_t> {`
`106`	`103`	`static reg_t`
`107`	`104`	`mask_i64gather(reg_t src, opmask_t mask, __m256i index, void const *base)`
`108`	`105`	`{`
`109`		`- return _mm256_mask_i64gather_epi64(src, (const long long int *) base, index, mask, scale);`
	`106`	`+ return _mm256_mask_i64gather_epi64(`
	`107`	`+ src, (const long long int *)base, index, mask, scale);`
`110`	`108`	`}`
`111`	`109`	`static reg_t i64gather(type_t *arr, arrsize_t *ind)`
`112`	`110`	`{`
`113`		`- return set(arr[ind[3]],`
`114`		`- arr[ind[2]],`
`115`		`- arr[ind[1]],`
`116`		`- arr[ind[0]]);`
	`111`	`+ return set(arr[ind[3]], arr[ind[2]], arr[ind[1]], arr[ind[0]]);`
`117`	`112`	`}`
`118`	`113`	`static reg_t loadu(void const *mem)`
`119`	`114`	`{`
`@@ -246,25 +241,20 @@ struct avx2_vector<uint64_t> {`
`246`	`241`	`{`
`247`	`242`	`return _mm256_set_epi64x(v1, v2, v3, v4);`
`248`	`243`	`}`
`249`		`- static reg_t set(type_t v1,`
`250`		`- type_t v2,`
`251`		`- type_t v3,`
`252`		`- type_t v4)`
	`244`	`+ static reg_t set(type_t v1, type_t v2, type_t v3, type_t v4)`
`253`	`245`	`{`
`254`	`246`	`return _mm256_set_epi64x(v1, v2, v3, v4);`
`255`	`247`	`}`
`256`	`248`	`template <int scale>`
`257`	`249`	`static reg_t`
`258`	`250`	`mask_i64gather(reg_t src, opmask_t mask, __m256i index, void const *base)`
`259`	`251`	`{`
`260`		`- return _mm256_mask_i64gather_epi64(src, (const long long int *) base, index, mask, scale);`
	`252`	`+ return _mm256_mask_i64gather_epi64(`
	`253`	`+ src, (const long long int *)base, index, mask, scale);`
`261`	`254`	`}`
`262`	`255`	`static reg_t i64gather(type_t *arr, arrsize_t *ind)`
`263`	`256`	`{`
`264`		`- return set(arr[ind[3]],`
`265`		`- arr[ind[2]],`
`266`		`- arr[ind[1]],`
`267`		`- arr[ind[0]]);`
	`257`	`+ return set(arr[ind[3]], arr[ind[2]], arr[ind[1]], arr[ind[0]]);`
`268`	`258`	`}`
`269`	`259`	`static opmask_t gt(reg_t x, reg_t y)`
`270`	`260`	`{`
`@@ -427,10 +417,7 @@ struct avx2_vector<double> {`
`427`	`417`	`{`
`428`	`418`	`return _mm256_set_epi64x(v1, v2, v3, v4);`
`429`	`419`	`}`
`430`		`- static reg_t set(type_t v1,`
`431`		`- type_t v2,`
`432`		`- type_t v3,`
`433`		`- type_t v4)`
	`420`	`+ static reg_t set(type_t v1, type_t v2, type_t v3, type_t v4)`
`434`	`421`	`{`
`435`	`422`	`return _mm256_set_pd(v1, v2, v3, v4);`
`436`	`423`	`}`
`@@ -450,16 +437,16 @@ struct avx2_vector<double> {`
`450`	`437`	`static reg_t`
`451`	`438`	`mask_i64gather(reg_t src, opmask_t mask, __m256i index, void const *base)`
`452`	`439`	`{`
`453`		`- return _mm256_mask_i64gather_pd(`
`454`		`- src, (const type_t *) base, index, _mm256_castsi256_pd(mask), scale);`
	`440`	`+ return _mm256_mask_i64gather_pd(src,`
	`441`	`+ (const type_t *)base,`
	`442`	`+ index,`
	`443`	`+ _mm256_castsi256_pd(mask),`
	`444`	`+ scale);`
`455`	`445`	`;`
`456`	`446`	`}`
`457`	`447`	`static reg_t i64gather(type_t *arr, arrsize_t *ind)`
`458`	`448`	`{`
`459`		`- return set(arr[ind[3]],`
`460`		`- arr[ind[2]],`
`461`		`- arr[ind[1]],`
`462`		`- arr[ind[0]]);`
	`449`	`+ return set(arr[ind[3]], arr[ind[2]], arr[ind[1]], arr[ind[0]]);`
`463`	`450`	`}`
`464`	`451`	`static reg_t loadu(void const *mem)`
`465`	`452`	`{`