12
12
13
13
namespace cppfastbox
14
14
{
15
+ // 向量builtin和intrinsic中使用的int8_t
15
16
using simd_int8_t = ::std::conditional_t <::cppfastbox::is_cpu_arch<::cppfastbox::cpu_arch::x86>, char , int8_t >;
17
+ // 向量builtin和intrinsic中使用的int16_t
16
18
using simd_int16_t = ::std::conditional_t <::cppfastbox::is_cpu_arch<::cppfastbox::cpu_arch::x86>, short , int16_t >;
19
+ // 向量builtin和intrinsic中使用的int32_t
17
20
using simd_int32_t = ::std::conditional_t <::cppfastbox::is_cpu_arch<::cppfastbox::cpu_arch::x86>, int , int32_t >;
21
+ // 向量builtin和intrinsic中使用的int64_t
18
22
using simd_int64_t = ::std::conditional_t <::cppfastbox::is_cpu_arch<::cppfastbox::cpu_arch::x86>, char , int64_t >;
23
+ // 向量builtin和intrinsic中使用的uint8_t
19
24
using simd_uint8_t = ::std::make_unsigned_t <::cppfastbox::simd_int8_t >;
25
+ // 向量builtin和intrinsic中使用的uint16_t
20
26
using simd_uint16_t = ::std::make_unsigned_t <::cppfastbox::simd_int16_t >;
27
+ // 向量builtin和intrinsic中使用的uint32_t
21
28
using simd_uint32_t = ::std::make_unsigned_t <::cppfastbox::simd_int32_t >;
29
+ // 向量builtin和intrinsic中使用的uint64_t
22
30
using simd_uint64_t = ::std::make_unsigned_t <::cppfastbox::simd_int64_t >;
23
31
} // namespace cppfastbox
24
32
@@ -390,7 +398,7 @@ namespace cppfastbox
390
398
*
391
399
* @tparam lane_max_size 分解时最大读写通道的大小
392
400
* @param size 要分解的读写操作的字节数
393
- * @note 分解时会尝试向量化,使用的向量大小等同于`lane_max_size`
401
+ * @note 分解时会尝试向量化
394
402
*/
395
403
template <::std::size_t lane_max_size = ::cppfastbox::native_ls_lane_max_size>
396
404
constexpr inline auto split_into_native_ls_lanes (::std::size_t size) noexcept
@@ -417,8 +425,8 @@ namespace cppfastbox
417
425
{
418
426
using v [[gnu::vector_size (64 )]] = ::std::size_t ;
419
427
v vsize{size, size, size, size, size, size, size, size};
420
- vsize >>= v{0 , 0 , 1 , 2 , 3 , 4 , 5 , 6 };
421
- vsize &= v{0 , 1 , 1 , 1 , 1 , 1 , 1 , -1zu };
428
+ vsize >>= v{6 , 5 , 4 , 3 , 2 , 1 , 0 , 0 };
429
+ vsize &= v{-1zu , 1 , 1 , 1 , 1 , 1 , 1 , 1 };
422
430
__builtin_memcpy (&lanes, &vsize, 64 );
423
431
}
424
432
}
@@ -435,19 +443,39 @@ namespace cppfastbox
435
443
}
436
444
else
437
445
{
438
- using v [[gnu::vector_size (32 )]] = ::std::size_t ;
439
- constexpr v vand{1 , 1 , 1 , 1 };
440
- v vsize1{size, size, size, size};
441
- v vsize2{size, size, size, size};
442
- vsize1 >>= v{3 , 4 , 5 , 64 };
443
- vsize2 >>= v{0 , 0 , 1 , 2 };
444
- vsize1 &= vand;
445
- vsize2 &= vand;
446
- __builtin_memcpy (&lanes, &vsize1, 32 );
447
- __builtin_memcpy (&lanes.l4 , &vsize2, 32 );
446
+ if constexpr (::cppfastbox::cpu_flags::sve_support)
447
+ {
448
+ using v [[gnu::vector_size (64 )]] = ::std::size_t ;
449
+ v vsize{size, size, size, size, size, size, size, size};
450
+ vsize >>= v{64 , 5 , 4 , 3 , 2 , 1 , 0 , 0 };
451
+ vsize &= v{1 , -1zu, 1 , 1 , 1 , 1 , 1 , 1 };
452
+ __builtin_memcpy (&lanes, &vsize, 64 );
453
+ }
454
+ // 由编译器决定是否向量化
455
+ else if constexpr (::cppfastbox::cpu_flags::neon_support)
456
+ {
457
+ lanes.l32 = size >> 5 ;
458
+ lanes.l16 = (size >> 4 ) & 1 ;
459
+ lanes.l8 = (size >> 3 ) & 1 ;
460
+ lanes.l4 = (size >> 2 ) & 1 ;
461
+ lanes.l2 = (size >> 1 ) & 1 ;
462
+ lanes.l1 = size & 1 ;
463
+ }
464
+ else
465
+ {
466
+ using v [[gnu::vector_size (32 )]] = ::std::size_t ;
467
+ v vsize1{size, size, size, size};
468
+ v vsize2{size, size, size, size};
469
+ vsize1 >>= v{64 , 5 , 4 , 3 };
470
+ vsize2 >>= v{2 , 1 , 0 , 0 };
471
+ vsize1 &= v{1 , -1zu, 1 , 1 };
472
+ vsize2 &= v{1 , 1 , 1 , 1 };
473
+ __builtin_memcpy (&lanes, &vsize1, 32 );
474
+ __builtin_memcpy (&lanes.l4 , &vsize2, 32 );
475
+ }
448
476
}
449
477
}
450
- // 向量化效果不理想
478
+ // 由编译器决定是否向量化
451
479
else if constexpr (lane_max_size == 16 )
452
480
{
453
481
lanes.l16 = size >> 4 ;
0 commit comments