11#ifndef XSS_NETWORK_QSORT
22#define XSS_NETWORK_QSORT
33
4+ #include " avx512-common-qsort.h"
5+
46template <typename vtype,
57 int64_t numVecs,
68 typename reg_t = typename vtype::reg_t >
79X86_SIMD_SORT_INLINE void bitonic_clean_n_vec (reg_t *regs)
810{
9- # pragma GCC unroll 64
11+ X86_SIMD_SORT_UNROLL_LOOP ( 64 )
1012 for (int num = numVecs / 2 ; num >= 2 ; num /= 2 ) {
11- # pragma GCC unroll 64
13+ X86_SIMD_SORT_UNROLL_LOOP ( 64 )
1214 for (int j = 0 ; j < numVecs; j += num) {
13- # pragma GCC unroll 64
15+ X86_SIMD_SORT_UNROLL_LOOP ( 64 )
1416 for (int i = 0 ; i < num / 2 ; i++) {
1517 COEX<vtype>(regs[i + j], regs[i + j + num / 2 ]);
1618 }
@@ -30,7 +32,7 @@ X86_SIMD_SORT_INLINE void bitonic_merge_n_vec(reg_t *regs)
3032 }
3133 else if constexpr (numVecs > 2 ) {
3234// Reverse upper half
33- # pragma GCC unroll 64
35+ X86_SIMD_SORT_UNROLL_LOOP ( 64 )
3436 for (int i = 0 ; i < numVecs / 2 ; i++) {
3537 reg_t rev = vtype::reverse (regs[numVecs - i - 1 ]);
3638 reg_t maxV = vtype::max (regs[i], rev);
@@ -44,7 +46,7 @@ X86_SIMD_SORT_INLINE void bitonic_merge_n_vec(reg_t *regs)
4446 bitonic_clean_n_vec<vtype, numVecs>(regs);
4547
4648// Now do bitonic_merge
47- # pragma GCC unroll 64
49+ X86_SIMD_SORT_UNROLL_LOOP ( 64 )
4850 for (int i = 0 ; i < numVecs; i++) {
4951 regs[i] = vtype::bitonic_merge (regs[i]);
5052 }
@@ -59,7 +61,7 @@ X86_SIMD_SORT_INLINE void bitonic_fullmerge_n_vec(reg_t *regs)
5961 if constexpr (numPer > numVecs)
6062 return ;
6163 else {
62- # pragma GCC unroll 64
64+ X86_SIMD_SORT_UNROLL_LOOP ( 64 )
6365 for (int i = 0 ; i < numVecs / numPer; i++) {
6466 bitonic_merge_n_vec<vtype, numPer>(regs + i * numPer);
6567 }
@@ -79,7 +81,7 @@ X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int32_t N)
7981
8082 // Generate masks for loading and storing
8183 typename vtype::opmask_t ioMasks[numVecs - numVecs / 2 ];
82- # pragma GCC unroll 64
84+ X86_SIMD_SORT_UNROLL_LOOP ( 64 )
8385 for (int i = numVecs / 2 , j = 0 ; i < numVecs; i++, j++) {
8486 int64_t num_to_read
8587 = std::min ((int64_t )std::max (0 , N - i * vtype::numlanes),
@@ -88,19 +90,19 @@ X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int32_t N)
8890 }
8991
9092// Unmasked part of the load
91- # pragma GCC unroll 64
93+ X86_SIMD_SORT_UNROLL_LOOP ( 64 )
9294 for (int i = 0 ; i < numVecs / 2 ; i++) {
9395 vecs[i] = vtype::loadu (arr + i * vtype::numlanes);
9496 }
9597// Masked part of the load
96- # pragma GCC unroll 64
98+ X86_SIMD_SORT_UNROLL_LOOP ( 64 )
9799 for (int i = numVecs / 2 , j = 0 ; i < numVecs; i++, j++) {
98100 vecs[i] = vtype::mask_loadu (
99101 vtype::zmm_max (), ioMasks[j], arr + i * vtype::numlanes);
100102 }
101103
102104// Sort each loaded vector
103- # pragma GCC unroll 64
105+ X86_SIMD_SORT_UNROLL_LOOP ( 64 )
104106 for (int i = 0 ; i < numVecs; i++) {
105107 vecs[i] = vtype::sort_vec (vecs[i]);
106108 }
@@ -109,12 +111,12 @@ X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int32_t N)
109111 bitonic_fullmerge_n_vec<vtype, numVecs>(&vecs[0 ]);
110112
111113// Unmasked part of the store
112- # pragma GCC unroll 64
114+ X86_SIMD_SORT_UNROLL_LOOP ( 64 )
113115 for (int i = 0 ; i < numVecs / 2 ; i++) {
114116 vtype::storeu (arr + i * vtype::numlanes, vecs[i]);
115117 }
116118// Masked part of the store
117- # pragma GCC unroll 64
119+ X86_SIMD_SORT_UNROLL_LOOP ( 64 )
118120 for (int i = numVecs / 2 , j = 0 ; i < numVecs; i++, j++) {
119121 vtype::mask_storeu (arr + i * vtype::numlanes, ioMasks[j], vecs[i]);
120122 }
0 commit comments