Skip to content

Commit 4ff3e1d

Browse files
committed
Changed to using new unroll macro
1 parent f028325 commit 4ff3e1d

File tree

1 file changed

+14
-12
lines changed

1 file changed

+14
-12
lines changed

src/xss-network-qsort.hpp

Lines changed: 14 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1,16 +1,18 @@
11
#ifndef XSS_NETWORK_QSORT
22
#define XSS_NETWORK_QSORT
33

4+
#include "avx512-common-qsort.h"
5+
46
template <typename vtype,
57
int64_t numVecs,
68
typename reg_t = typename vtype::reg_t>
79
X86_SIMD_SORT_INLINE void bitonic_clean_n_vec(reg_t *regs)
810
{
9-
#pragma GCC unroll 64
11+
X86_SIMD_SORT_UNROLL_LOOP(64)
1012
for (int num = numVecs / 2; num >= 2; num /= 2) {
11-
#pragma GCC unroll 64
13+
X86_SIMD_SORT_UNROLL_LOOP(64)
1214
for (int j = 0; j < numVecs; j += num) {
13-
#pragma GCC unroll 64
15+
X86_SIMD_SORT_UNROLL_LOOP(64)
1416
for (int i = 0; i < num / 2; i++) {
1517
COEX<vtype>(regs[i + j], regs[i + j + num / 2]);
1618
}
@@ -30,7 +32,7 @@ X86_SIMD_SORT_INLINE void bitonic_merge_n_vec(reg_t *regs)
3032
}
3133
else if constexpr (numVecs > 2) {
3234
// Reverse upper half
33-
#pragma GCC unroll 64
35+
X86_SIMD_SORT_UNROLL_LOOP(64)
3436
for (int i = 0; i < numVecs / 2; i++) {
3537
reg_t rev = vtype::reverse(regs[numVecs - i - 1]);
3638
reg_t maxV = vtype::max(regs[i], rev);
@@ -44,7 +46,7 @@ X86_SIMD_SORT_INLINE void bitonic_merge_n_vec(reg_t *regs)
4446
bitonic_clean_n_vec<vtype, numVecs>(regs);
4547

4648
// Now do bitonic_merge
47-
#pragma GCC unroll 64
49+
X86_SIMD_SORT_UNROLL_LOOP(64)
4850
for (int i = 0; i < numVecs; i++) {
4951
regs[i] = vtype::bitonic_merge(regs[i]);
5052
}
@@ -59,7 +61,7 @@ X86_SIMD_SORT_INLINE void bitonic_fullmerge_n_vec(reg_t *regs)
5961
if constexpr (numPer > numVecs)
6062
return;
6163
else {
62-
#pragma GCC unroll 64
64+
X86_SIMD_SORT_UNROLL_LOOP(64)
6365
for (int i = 0; i < numVecs / numPer; i++) {
6466
bitonic_merge_n_vec<vtype, numPer>(regs + i * numPer);
6567
}
@@ -79,7 +81,7 @@ X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int32_t N)
7981

8082
// Generate masks for loading and storing
8183
typename vtype::opmask_t ioMasks[numVecs - numVecs / 2];
82-
#pragma GCC unroll 64
84+
X86_SIMD_SORT_UNROLL_LOOP(64)
8385
for (int i = numVecs / 2, j = 0; i < numVecs; i++, j++) {
8486
int64_t num_to_read
8587
= std::min((int64_t)std::max(0, N - i * vtype::numlanes),
@@ -88,19 +90,19 @@ X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int32_t N)
8890
}
8991

9092
// Unmasked part of the load
91-
#pragma GCC unroll 64
93+
X86_SIMD_SORT_UNROLL_LOOP(64)
9294
for (int i = 0; i < numVecs / 2; i++) {
9395
vecs[i] = vtype::loadu(arr + i * vtype::numlanes);
9496
}
9597
// Masked part of the load
96-
#pragma GCC unroll 64
98+
X86_SIMD_SORT_UNROLL_LOOP(64)
9799
for (int i = numVecs / 2, j = 0; i < numVecs; i++, j++) {
98100
vecs[i] = vtype::mask_loadu(
99101
vtype::zmm_max(), ioMasks[j], arr + i * vtype::numlanes);
100102
}
101103

102104
// Sort each loaded vector
103-
#pragma GCC unroll 64
105+
X86_SIMD_SORT_UNROLL_LOOP(64)
104106
for (int i = 0; i < numVecs; i++) {
105107
vecs[i] = vtype::sort_vec(vecs[i]);
106108
}
@@ -109,12 +111,12 @@ X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int32_t N)
109111
bitonic_fullmerge_n_vec<vtype, numVecs>(&vecs[0]);
110112

111113
// Unmasked part of the store
112-
#pragma GCC unroll 64
114+
X86_SIMD_SORT_UNROLL_LOOP(64)
113115
for (int i = 0; i < numVecs / 2; i++) {
114116
vtype::storeu(arr + i * vtype::numlanes, vecs[i]);
115117
}
116118
// Masked part of the store
117-
#pragma GCC unroll 64
119+
X86_SIMD_SORT_UNROLL_LOOP(64)
118120
for (int i = numVecs / 2, j = 0; i < numVecs; i++, j++) {
119121
vtype::mask_storeu(arr + i * vtype::numlanes, ioMasks[j], vecs[i]);
120122
}

0 commit comments

Comments
 (0)