Skip to content

Commit d617059

Browse files
committed
Changed how partition code shortens the array before the main loop
1 parent b52e889 commit d617059

File tree

1 file changed

+39
-8
lines changed

1 file changed

+39
-8
lines changed

src/avx512-common-qsort.h

Lines changed: 39 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -380,12 +380,8 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512_unrolled(type_t *arr,
380380
arr, left, right, pivot, smallest, biggest);
381381
}
382382

383-
if (right - left <= 2 * num_unroll * vtype::numlanes) {
384-
return partition_avx512<vtype>(
385-
arr, left, right, pivot, smallest, biggest);
386-
}
387-
/* make array length divisible by 8*vtype::numlanes , shortening the array */
388-
for (int32_t i = ((right - left) % (num_unroll * vtype::numlanes)); i > 0;
383+
/* make array length divisible by vtype::numlanes , shortening the array */
384+
for (int32_t i = ((right - left) % (vtype::numlanes)); i > 0;
389385
--i) {
390386
*smallest = std::min(*smallest, arr[left], comparison_func<vtype>);
391387
*biggest = std::max(*biggest, arr[left], comparison_func<vtype>);
@@ -396,14 +392,49 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512_unrolled(type_t *arr,
396392
++left;
397393
}
398394
}
399-
395+
400396
if (left == right)
401397
return left; /* less than vtype::numlanes elements in the array */
402-
398+
403399
using reg_t = typename vtype::reg_t;
404400
reg_t pivot_vec = vtype::set1(pivot);
405401
reg_t min_vec = vtype::set1(*smallest);
406402
reg_t max_vec = vtype::set1(*biggest);
403+
404+
int64_t vecsToPartition = ((right - left) / vtype::numlanes) % num_unroll;
405+
type_t buffer[num_unroll * vtype::numlanes];
406+
int32_t bufferStored = 0;
407+
int64_t leftStore = left;
408+
409+
for (int32_t i = 0; i < vecsToPartition; i++){
410+
reg_t curr_vec = vtype::loadu(arr + left + i * vtype::numlanes);
411+
typename vtype::opmask_t ge_mask = vtype::ge(curr_vec, pivot_vec);
412+
int32_t amount_ge_pivot = _mm_popcnt_u64((int64_t)ge_mask);
413+
vtype::mask_compressstoreu(
414+
arr + leftStore, vtype::knot_opmask(ge_mask), curr_vec);
415+
416+
vtype::mask_compressstoreu(
417+
buffer + bufferStored, ge_mask, curr_vec);
418+
419+
min_vec = vtype::min(curr_vec, min_vec);
420+
max_vec = vtype::max(curr_vec, max_vec);
421+
422+
bufferStored += amount_ge_pivot;
423+
leftStore += vtype::numlanes - amount_ge_pivot;
424+
}
425+
426+
// We can't just store the buffer on the right, since this would overwrite data that has no copies elsewhere
427+
// Instead, copy the data that is currently on the right, and store it on the left side in the gap starting at leftStore (the slots vacated by the elements that were moved into the buffer)
428+
// Then we copy the buffer onto the right side
429+
std::memcpy(arr + leftStore, arr + right - bufferStored, bufferStored * sizeof(type_t));
430+
std::memcpy(arr + right - bufferStored, buffer, bufferStored * sizeof(type_t));
431+
432+
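// left advances only past the elements smaller than the pivot; the data displaced from the right by the buffer now sits just after them and still has to be partitioned, so the unpartitioned range shrinks by exactly vecsToPartition * vtype::numlanes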
433+
left += vecsToPartition * vtype::numlanes - bufferStored;
434+
right -= bufferStored;
435+
436+
if (left == right)
437+
return left; /* less than vtype::numlanes elements in the array */
407438

408439
// We will now have at least 16 registers worth of data to process:
409440
// left and right vtype::numlanes values are partitioned at the end

0 commit comments

Comments
 (0)