Skip to content

Commit 6fb8f01

Browse files
committed
Changed integer types in many places (uint64_t/int64_t to arrsize_t; int64_t/int32_t to int), removed unused bitonic sort logic
1 parent 6594398 commit 6fb8f01

File tree

2 files changed

+23
-49
lines changed

2 files changed

+23
-49
lines changed

src/avx512-common-qsort.h

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,7 @@ X86_SIMD_SORT_INLINE void partition_vec(type_t *arr,
247247
reg_t &biggest_vec)
248248
{
249249
typename vtype::opmask_t ge_mask = vtype::ge(curr_vec, pivot_vec);
250-
uint64_t amount_ge_pivot = _mm_popcnt_u64(ge_mask);
250+
arrsize_t amount_ge_pivot = _mm_popcnt_u64(ge_mask);
251251
vtype::mask_compressstoreu(
252252
arr + left, vtype::knot_opmask(ge_mask), curr_vec);
253253

@@ -294,8 +294,8 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512(type_t *arr,
294294

295295
if (right - left == vtype::numlanes) {
296296
reg_t vec = vtype::loadu(arr + left);
297-
uint64_t unpartitioned = right - left - vtype::numlanes;
298-
uint64_t l_store = left;
297+
arrsize_t unpartitioned = right - left - vtype::numlanes;
298+
arrsize_t l_store = left;
299299

300300
partition_vec<vtype>(
301301
arr, l_store, unpartitioned, vec, pivot_vec, min_vec, max_vec);
@@ -389,12 +389,12 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512_unrolled(type_t *arr,
389389
reg_t min_vec = vtype::set1(*smallest);
390390
reg_t max_vec = vtype::set1(*biggest);
391391

392-
int64_t vecsToPartition = ((right - left) / vtype::numlanes) % num_unroll;
392+
int vecsToPartition = ((right - left) / vtype::numlanes) % num_unroll;
393393
type_t buffer[num_unroll * vtype::numlanes];
394394
int32_t bufferStored = 0;
395-
int64_t leftStore = left;
395+
arrsize_t leftStore = left;
396396

397-
for (int32_t i = 0; i < vecsToPartition; i++) {
397+
for (int i = 0; i < vecsToPartition; i++) {
398398
reg_t curr_vec = vtype::loadu(arr + left + i * vtype::numlanes);
399399
typename vtype::opmask_t ge_mask = vtype::ge(curr_vec, pivot_vec);
400400
int32_t amount_ge_pivot = _mm_popcnt_u64((int64_t)ge_mask);
@@ -853,8 +853,8 @@ void sort_n(typename vtype::type_t *arr, int N);
853853

854854
template <typename vtype, typename type_t>
855855
X86_SIMD_SORT_INLINE type_t get_pivot_blocks(type_t *arr,
856-
uint64_t left,
857-
uint64_t right);
856+
arrsize_t left,
857+
arrsize_t right);
858858

859859
template <typename vtype, typename type_t>
860860
static void

src/xss-network-qsort.hpp

Lines changed: 15 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -5,24 +5,7 @@
55
#include "xss-optimal-networks.hpp"
66

77
template <typename vtype,
8-
int64_t numVecs,
9-
typename reg_t = typename vtype::reg_t>
10-
X86_SIMD_SORT_FINLINE void bitonic_clean_n_vec(reg_t *regs)
11-
{
12-
X86_SIMD_SORT_UNROLL_LOOP(512)
13-
for (int num = numVecs / 2; num >= 2; num /= 2) {
14-
X86_SIMD_SORT_UNROLL_LOOP(512)
15-
for (int j = 0; j < numVecs; j += num) {
16-
X86_SIMD_SORT_UNROLL_LOOP(512)
17-
for (int i = 0; i < num / 2; i++) {
18-
COEX<vtype>(regs[i + j], regs[i + j + num / 2]);
19-
}
20-
}
21-
}
22-
}
23-
24-
template <typename vtype,
25-
int64_t numVecs,
8+
int numVecs,
269
typename reg_t = typename vtype::reg_t>
2710
X86_SIMD_SORT_FINLINE void bitonic_sort_n_vec(reg_t *regs)
2811
{
@@ -46,20 +29,11 @@ X86_SIMD_SORT_FINLINE void bitonic_sort_n_vec(reg_t *regs)
4629
optimal_sort_32<vtype>(regs);
4730
}
4831
else {
49-
// TODO should we remove this branch? I believe it is never used in the current code
50-
bitonic_sort_n_vec<vtype, numVecs / 2>(regs);
51-
bitonic_sort_n_vec<vtype, numVecs / 2>(regs + numVecs / 2);
52-
53-
X86_SIMD_SORT_UNROLL_LOOP(64)
54-
for (int i = 0; i < numVecs / 2; i++) {
55-
COEX<vtype>(regs[i], regs[numVecs - 1 - i]);
56-
}
57-
58-
bitonic_clean_n_vec<vtype, numVecs>(regs);
32+
static_assert(numVecs == -1, "should not reach here");
5933
}
6034
}
6135

62-
template <typename vtype, int64_t numVecs, int64_t scale, bool first = true>
36+
template <typename vtype, int numVecs, int scale, bool first = true>
6337
X86_SIMD_SORT_FINLINE void internal_merge_n_vec(typename vtype::reg_t *reg)
6438
{
6539
using reg_t = typename vtype::reg_t;
@@ -94,8 +68,8 @@ X86_SIMD_SORT_FINLINE void internal_merge_n_vec(typename vtype::reg_t *reg)
9468
}
9569

9670
template <typename vtype,
97-
int64_t numVecs,
98-
int64_t scale,
71+
int numVecs,
72+
int scale,
9973
typename reg_t = typename vtype::reg_t>
10074
X86_SIMD_SORT_FINLINE void merge_substep_n_vec(reg_t *regs)
10175
{
@@ -121,8 +95,8 @@ X86_SIMD_SORT_FINLINE void merge_substep_n_vec(reg_t *regs)
12195
}
12296

12397
template <typename vtype,
124-
int64_t numVecs,
125-
int64_t scale,
98+
int numVecs,
99+
int scale,
126100
typename reg_t = typename vtype::reg_t>
127101
X86_SIMD_SORT_FINLINE void merge_step_n_vec(reg_t *regs)
128102
{
@@ -134,8 +108,8 @@ X86_SIMD_SORT_FINLINE void merge_step_n_vec(reg_t *regs)
134108
}
135109

136110
template <typename vtype,
137-
int64_t numVecs,
138-
int64_t numPer = 2,
111+
int numVecs,
112+
int numPer = 2,
139113
typename reg_t = typename vtype::reg_t>
140114
X86_SIMD_SORT_FINLINE void merge_n_vec(reg_t *regs)
141115
{
@@ -216,22 +190,22 @@ X86_SIMD_SORT_INLINE void sort_n(typename vtype::type_t *arr, int N)
216190

217191
template <typename vtype, typename type_t>
218192
X86_SIMD_SORT_INLINE type_t get_pivot(type_t *arr,
219-
const int64_t left,
220-
const int64_t right);
193+
const arrsize_t left,
194+
const arrsize_t right);
221195

222196
template <typename vtype, typename type_t>
223197
X86_SIMD_SORT_INLINE type_t get_pivot_blocks(type_t *arr,
224-
uint64_t left,
225-
uint64_t right)
198+
arrsize_t left,
199+
arrsize_t right)
226200
{
227201

228202
if (right - left <= 1024) { return get_pivot<vtype>(arr, left, right); }
229203

230204
using reg_t = typename vtype::reg_t;
231205
constexpr int numVecs = 5;
232206

233-
uint64_t width = (right - vtype::numlanes) - left;
234-
uint64_t delta = width / numVecs;
207+
arrsize_t width = (right - vtype::numlanes) - left;
208+
arrsize_t delta = width / numVecs;
235209

236210
reg_t vecs[numVecs];
237211
// Load data

0 commit comments

Comments
 (0)