@@ -411,15 +411,13 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512_unrolled(type_t *arr,
411411
412412    for  (int  i = 0 ; i < vecsToPartition; i++) {
413413        reg_t  curr_vec = vtype::loadu (arr + left + i * vtype::numlanes);
414-         typename  vtype::opmask_t  ge_mask = vtype::ge (curr_vec, pivot_vec);
415-         int32_t  amount_ge_pivot = _mm_popcnt_u64 ((int64_t )ge_mask);
416-         vtype::mask_compressstoreu (
417-                 arr + leftStore, vtype::knot_opmask (ge_mask), curr_vec);
418- 
419-         vtype::mask_compressstoreu (buffer + bufferStored, ge_mask, curr_vec);
420- 
421-         min_vec = vtype::min (curr_vec, min_vec);
422-         max_vec = vtype::max (curr_vec, max_vec);
414+         
415+         int32_t  amount_ge_pivot = partition_vec<vtype>(arr + leftStore,
416+                              buffer + num_unroll * vtype::numlanes - bufferStored - vtype::numlanes,
417+                              curr_vec,
418+                              pivot_vec,
419+                              min_vec,
420+                              max_vec);
423421
424422        bufferStored += amount_ge_pivot;
425423        leftStore += vtype::numlanes - amount_ge_pivot;
@@ -435,7 +433,7 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512_unrolled(type_t *arr,
435433                arr + right - bufferStored,
436434                bufferStored * sizeof (type_t ));
437435    std::memcpy (
438-             arr + right - bufferStored, buffer, bufferStored * sizeof (type_t ));
436+             arr + right - bufferStored, buffer + num_unroll * vtype::numlanes - bufferStored , bufferStored * sizeof (type_t ));
439437
440438    //  The change to left depends only on numVecs, since we store the data replaced by the buffer on the left side
441439    left += vecsToPartition * vtype::numlanes - bufferStored;
0 commit comments