@@ -108,7 +108,7 @@ X86_SIMD_SORT_INLINE void argsort_32_64bit(type_t *arr, int64_t *arg, int32_t N)
108108    zmm_t  arrzmm[4 ];
109109    argzmm_t  argzmm[4 ];
110110
111- # pragma   X86_SIMD_SORT_UNROLL_LOOP(2)
111+ X86_SIMD_SORT_UNROLL_LOOP (2 )
112112    for  (int  ii = 0 ; ii < 2 ; ++ii) {
113113        argzmm[ii] = argtype::loadu (arg + 8  * ii);
114114        arrzmm[ii] = vtype::template  i64gather<sizeof (type_t )>(argzmm[ii], arr);
@@ -117,7 +117,7 @@ X86_SIMD_SORT_INLINE void argsort_32_64bit(type_t *arr, int64_t *arg, int32_t N)
117117
118118    uint64_t  combined_mask = (0x1ull  << (N - 16 )) - 0x1ull ;
119119    opmask_t  load_mask[2 ] = {0xFF , 0xFF };
120- # pragma   X86_SIMD_SORT_UNROLL_LOOP(2)
120+ X86_SIMD_SORT_UNROLL_LOOP (2 )
121121    for  (int  ii = 0 ; ii < 2 ; ++ii) {
122122        load_mask[ii] = (combined_mask >> (ii * 8 )) & 0xFF ;
123123        argzmm[ii + 2 ] = argtype::maskz_loadu (load_mask[ii], arg + 16  + 8  * ii);
@@ -151,7 +151,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
151151    zmm_t  arrzmm[8 ];
152152    argzmm_t  argzmm[8 ];
153153
154- # pragma   X86_SIMD_SORT_UNROLL_LOOP(4)
154+ X86_SIMD_SORT_UNROLL_LOOP (4 )
155155    for  (int  ii = 0 ; ii < 4 ; ++ii) {
156156        argzmm[ii] = argtype::loadu (arg + 8  * ii);
157157        arrzmm[ii] = vtype::template  i64gather<sizeof (type_t )>(argzmm[ii], arr);
@@ -160,7 +160,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
160160
161161    opmask_t  load_mask[4 ] = {0xFF , 0xFF , 0xFF , 0xFF };
162162    uint64_t  combined_mask = (0x1ull  << (N - 32 )) - 0x1ull ;
163- # pragma   X86_SIMD_SORT_UNROLL_LOOP(4)
163+ X86_SIMD_SORT_UNROLL_LOOP (4 )
164164    for  (int  ii = 0 ; ii < 4 ; ++ii) {
165165        load_mask[ii] = (combined_mask >> (ii * 8 )) & 0xFF ;
166166        argzmm[ii + 4 ] = argtype::maskz_loadu (load_mask[ii], arg + 32  + 8  * ii);
@@ -170,7 +170,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
170170                                                        argzmm[ii + 4 ]);
171171    }
172172
173- # pragma   X86_SIMD_SORT_UNROLL_LOOP(4)
173+ X86_SIMD_SORT_UNROLL_LOOP (4 )
174174    for  (int  ii = 0 ; ii < 8 ; ii = ii + 2 ) {
175175        bitonic_merge_two_zmm_64bit<vtype, argtype>(
176176                arrzmm[ii], arrzmm[ii + 1 ], argzmm[ii], argzmm[ii + 1 ]);
@@ -179,11 +179,11 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
179179    bitonic_merge_four_zmm_64bit<vtype, argtype>(arrzmm + 4 , argzmm + 4 );
180180    bitonic_merge_eight_zmm_64bit<vtype, argtype>(arrzmm, argzmm);
181181
182- # pragma   X86_SIMD_SORT_UNROLL_LOOP(4)
182+ X86_SIMD_SORT_UNROLL_LOOP (4 )
183183    for  (int  ii = 0 ; ii < 4 ; ++ii) {
184184        argtype::storeu (arg + 8  * ii, argzmm[ii]);
185185    }
186- # pragma   X86_SIMD_SORT_UNROLL_LOOP(4)
186+ X86_SIMD_SORT_UNROLL_LOOP (4 )
187187    for  (int  ii = 0 ; ii < 4 ; ++ii) {
188188        argtype::mask_storeu (arg + 32  + 8  * ii, load_mask[ii], argzmm[ii + 4 ]);
189189    }
@@ -203,7 +203,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
203203//     zmm_t arrzmm[16];
204204//     argzmm_t argzmm[16];
205205// 
206- // #pragma  X86_SIMD_SORT_UNROLL_LOOP(8)
206+ // X86_SIMD_SORT_UNROLL_LOOP(8)
207207//     for (int ii = 0; ii < 8; ++ii) {
208208//         argzmm[ii] = argtype::loadu(arg + 8*ii);
209209//         arrzmm[ii] = vtype::template i64gather<sizeof(type_t)>(argzmm[ii], arr);
@@ -213,19 +213,19 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
213213//     opmask_t load_mask[8] = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
214214//     if (N != 128) {
215215//     uint64_t combined_mask = (0x1ull << (N - 64)) - 0x1ull;
216- // #pragma  X86_SIMD_SORT_UNROLL_LOOP(8)
216+ // X86_SIMD_SORT_UNROLL_LOOP(8)
217217//         for (int ii = 0; ii < 8; ++ii) {
218218//             load_mask[ii] = (combined_mask >> (ii*8)) & 0xFF;
219219//         }
220220//     }
221- // #pragma  X86_SIMD_SORT_UNROLL_LOOP(8)
221+ // X86_SIMD_SORT_UNROLL_LOOP(8)
222222//     for (int ii = 0; ii < 8; ++ii) {
223223//         argzmm[ii+8] = argtype::maskz_loadu(load_mask[ii], arg + 64 + 8*ii);
224224//         arrzmm[ii+8] = vtype::template mask_i64gather<sizeof(type_t)>(vtype::zmm_max(), load_mask[ii], argzmm[ii+8], arr);
225225//         arrzmm[ii+8] = sort_zmm_64bit<vtype, argtype>(arrzmm[ii+8], argzmm[ii+8]);
226226//     }
227227// 
228- // #pragma  X86_SIMD_SORT_UNROLL_LOOP(8)
228+ // X86_SIMD_SORT_UNROLL_LOOP(8)
229229//     for (int ii = 0; ii < 16; ii = ii + 2) {
230230//         bitonic_merge_two_zmm_64bit<vtype, argtype>(arrzmm[ii], arrzmm[ii + 1], argzmm[ii], argzmm[ii + 1]);
231231//     }
@@ -237,11 +237,11 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
237237//     bitonic_merge_eight_zmm_64bit<vtype, argtype>(arrzmm+8, argzmm+8);
238238//     bitonic_merge_sixteen_zmm_64bit<vtype, argtype>(arrzmm, argzmm);
239239// 
240- // #pragma  X86_SIMD_SORT_UNROLL_LOOP(8)
240+ // X86_SIMD_SORT_UNROLL_LOOP(8)
241241//     for (int ii = 0; ii < 8; ++ii) {
242242//         argtype::storeu(arg + 8*ii, argzmm[ii]);
243243//     }
244- // #pragma  X86_SIMD_SORT_UNROLL_LOOP(8)
244+ // X86_SIMD_SORT_UNROLL_LOOP(8)
245245//     for (int ii = 0; ii < 8; ++ii) {
246246//         argtype::mask_storeu(arg + 64 + 8*ii, load_mask[ii], argzmm[ii + 8]);
247247//     }
0 commit comments