@@ -380,12 +380,8 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512_unrolled(type_t *arr,
380380                arr, left, right, pivot, smallest, biggest);
381381    }
382382
383-     if  (right - left <= 2  * num_unroll * vtype::numlanes) {
384-         return  partition_avx512<vtype>(
385-                 arr, left, right, pivot, smallest, biggest);
386-     }
387-     /*  make array length divisible by 8*vtype::numlanes , shortening the array */ 
388-     for  (int32_t  i = ((right - left) % (num_unroll * vtype::numlanes)); i > 0 ;
383+     /*  make array length divisible by vtype::numlanes , shortening the array */ 
384+     for  (int32_t  i = ((right - left) % (vtype::numlanes)); i > 0 ;
389385         --i) {
390386        *smallest = std::min (*smallest, arr[left], comparison_func<vtype>);
391387        *biggest = std::max (*biggest, arr[left], comparison_func<vtype>);
@@ -396,14 +392,49 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512_unrolled(type_t *arr,
396392            ++left;
397393        }
398394    }
399- 
395+      
400396    if  (left == right)
401397        return  left; /*  less than vtype::numlanes elements in the array */ 
402- 
398+      
403399    using  reg_t  = typename  vtype::reg_t ;
404400    reg_t  pivot_vec = vtype::set1 (pivot);
405401    reg_t  min_vec = vtype::set1 (*smallest);
406402    reg_t  max_vec = vtype::set1 (*biggest);
403+     
404+     int64_t  vecsToPartition = ((right - left) / vtype::numlanes) % num_unroll;
405+     type_t  buffer[num_unroll * vtype::numlanes];
406+     int32_t  bufferStored = 0 ;
407+     int64_t  leftStore = left;
408+     
409+     for  (int32_t  i = 0 ; i < vecsToPartition; i++){
410+         reg_t  curr_vec = vtype::loadu (arr + left + i * vtype::numlanes);
411+         typename  vtype::opmask_t  ge_mask = vtype::ge (curr_vec, pivot_vec);
412+         int32_t  amount_ge_pivot = _mm_popcnt_u64 ((int64_t )ge_mask);
413+         vtype::mask_compressstoreu (
414+                 arr + leftStore, vtype::knot_opmask (ge_mask), curr_vec);
415+                 
416+         vtype::mask_compressstoreu (
417+                 buffer + bufferStored, ge_mask, curr_vec);
418+                 
419+         min_vec = vtype::min (curr_vec, min_vec);
420+         max_vec = vtype::max (curr_vec, max_vec);
421+         
422+         bufferStored += amount_ge_pivot;
423+         leftStore += vtype::numlanes - amount_ge_pivot;
424+     }
425+     
426+     //  We can't just store the buffer on the right, since this would overwrite data that has no copies elsewhere.
427+     //  Instead, first copy the data that is currently on the right into the gap on the left side, i.e. the bufferStored slots starting at leftStore.
428+     //  Then we copy the buffer onto the right side.
429+     std::memcpy (arr + leftStore, arr + right - bufferStored, bufferStored * sizeof (type_t ));
430+     std::memcpy (arr + right - bufferStored, buffer, bufferStored * sizeof (type_t ));
431+     
432+     //  left advances past the elements that were not >= pivot (ending up at leftStore); the slots after it now hold the unpartitioned data moved over from the right side
433+     left += vecsToPartition * vtype::numlanes - bufferStored;
434+     right -= bufferStored;
435+ 
436+     if  (left == right)
437+         return  left; /*  less than vtype::numlanes elements in the array */ 
407438
408439    //  We will now have at least 16 registers worth of data to process:
409440    //  left and right vtype::numlanes values are partitioned at the end
0 commit comments