Skip to content

Commit 9bf84f9

Browse files
committed
改进split_into_native_ls_lanes向量化
1 parent 36a3767 commit 9bf84f9

File tree

2 files changed

+175
-14
lines changed

2 files changed

+175
-14
lines changed

include/base/utility.h

Lines changed: 42 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,21 @@
1212

1313
namespace cppfastbox
1414
{
15+
// 向量builtin和intrinsic中使用的int8_t
1516
using simd_int8_t = ::std::conditional_t<::cppfastbox::is_cpu_arch<::cppfastbox::cpu_arch::x86>, char, int8_t>;
17+
// 向量builtin和intrinsic中使用的int16_t
1618
using simd_int16_t = ::std::conditional_t<::cppfastbox::is_cpu_arch<::cppfastbox::cpu_arch::x86>, short, int16_t>;
19+
// 向量builtin和intrinsic中使用的int32_t
1720
using simd_int32_t = ::std::conditional_t<::cppfastbox::is_cpu_arch<::cppfastbox::cpu_arch::x86>, int, int32_t>;
21+
// 向量builtin和intrinsic中使用的int64_t
1822
using simd_int64_t = ::std::conditional_t<::cppfastbox::is_cpu_arch<::cppfastbox::cpu_arch::x86>, char, int64_t>;
23+
// 向量builtin和intrinsic中使用的uint8_t
1924
using simd_uint8_t = ::std::make_unsigned_t<::cppfastbox::simd_int8_t>;
25+
// 向量builtin和intrinsic中使用的uint16_t
2026
using simd_uint16_t = ::std::make_unsigned_t<::cppfastbox::simd_int16_t>;
27+
// 向量builtin和intrinsic中使用的uint32_t
2128
using simd_uint32_t = ::std::make_unsigned_t<::cppfastbox::simd_int32_t>;
29+
// 向量builtin和intrinsic中使用的uint64_t
2230
using simd_uint64_t = ::std::make_unsigned_t<::cppfastbox::simd_int64_t>;
2331
} // namespace cppfastbox
2432

@@ -390,7 +398,7 @@ namespace cppfastbox
390398
*
391399
* @tparam lane_max_size 分解时最大读写通道的大小
392400
* @param size 要分解的读写操作的字节数
393-
* @note 分解时会尝试向量化,使用的向量大小等同于`lane_max_size`
401+
* @note 分解时会尝试向量化
394402
*/
395403
template <::std::size_t lane_max_size = ::cppfastbox::native_ls_lane_max_size>
396404
constexpr inline auto split_into_native_ls_lanes(::std::size_t size) noexcept
@@ -417,8 +425,8 @@ namespace cppfastbox
417425
{
418426
using v [[gnu::vector_size(64)]] = ::std::size_t;
419427
v vsize{size, size, size, size, size, size, size, size};
420-
vsize >>= v{0, 0, 1, 2, 3, 4, 5, 6};
421-
vsize &= v{0, 1, 1, 1, 1, 1, 1, -1zu};
428+
vsize >>= v{6, 5, 4, 3, 2, 1, 0, 0};
429+
vsize &= v{-1zu, 1, 1, 1, 1, 1, 1, 1};
422430
__builtin_memcpy(&lanes, &vsize, 64);
423431
}
424432
}
@@ -435,19 +443,39 @@ namespace cppfastbox
435443
}
436444
else
437445
{
438-
using v [[gnu::vector_size(32)]] = ::std::size_t;
439-
constexpr v vand{1, 1, 1, 1};
440-
v vsize1{size, size, size, size};
441-
v vsize2{size, size, size, size};
442-
vsize1 >>= v{3, 4, 5, 64};
443-
vsize2 >>= v{0, 0, 1, 2};
444-
vsize1 &= vand;
445-
vsize2 &= vand;
446-
__builtin_memcpy(&lanes, &vsize1, 32);
447-
__builtin_memcpy(&lanes.l4, &vsize2, 32);
446+
if constexpr(::cppfastbox::cpu_flags::sve_support)
447+
{
448+
using v [[gnu::vector_size(64)]] = ::std::size_t;
449+
v vsize{size, size, size, size, size, size, size, size};
450+
vsize >>= v{64, 5, 4, 3, 2, 1, 0, 0};
451+
vsize &= v{1, -1zu, 1, 1, 1, 1, 1, 1};
452+
__builtin_memcpy(&lanes, &vsize, 64);
453+
}
454+
// 由编译器决定是否向量化
455+
else if constexpr(::cppfastbox::cpu_flags::neon_support)
456+
{
457+
lanes.l32 = size >> 5;
458+
lanes.l16 = (size >> 4) & 1;
459+
lanes.l8 = (size >> 3) & 1;
460+
lanes.l4 = (size >> 2) & 1;
461+
lanes.l2 = (size >> 1) & 1;
462+
lanes.l1 = size & 1;
463+
}
464+
else
465+
{
466+
using v [[gnu::vector_size(32)]] = ::std::size_t;
467+
v vsize1{size, size, size, size};
468+
v vsize2{size, size, size, size};
469+
vsize1 >>= v{64, 5, 4, 3};
470+
vsize2 >>= v{2, 1, 0, 0};
471+
vsize1 &= v{1, -1zu, 1, 1};
472+
vsize2 &= v{1, 1, 1, 1};
473+
__builtin_memcpy(&lanes, &vsize1, 32);
474+
__builtin_memcpy(&lanes.l4, &vsize2, 32);
475+
}
448476
}
449477
}
450-
// 向量化效果不理想
478+
// 由编译器决定是否向量化
451479
else if constexpr(lane_max_size == 16)
452480
{
453481
lanes.l16 = size >> 4;
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
/**
2+
* @file split_into_native_ls_lanes_rt.cpp
3+
* @brief 测试split_into_native_ls_lanes的运行时实现
4+
*
5+
* @copyright Copyright (c) 2024-present Trajectronix Open Source Group
6+
*
7+
*/
8+
#include "../../include/base/utility.h"
9+
#ifdef CPPFASTBOX_HOSTED_TEST
10+
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
11+
#include <doctest/doctest.h>
12+
#define CPPFASTBOX_ASSERT CHECK
13+
#define CPPFASTBOX_TEST(name) TEST_CASE(#name)
14+
#else
15+
#include "../../include/libc/assert.h"
16+
#define CPPFASTBOX_ASSERT always_assert
17+
#define CPPFASTBOX_TEST(name) void name() noexcept
18+
#endif
19+
using namespace cppfastbox;
20+
21+
CPPFASTBOX_TEST(test_split_into_native_ls_lanes_64)
{
    // 255 bytes: bits 0..7 are all set. With a 64-byte maximum lane the two top bits
    // (128 + 64) collapse into three 64-byte lanes; every smaller lane is used once.
    constexpr auto byte_count{0b1111'1111};
    auto split{split_into_native_ls_lanes<64>(byte_count)};
    CPPFASTBOX_ASSERT(split.l64 == 3);
    CPPFASTBOX_ASSERT(split.l32 == 1);
    CPPFASTBOX_ASSERT(split.l16 == 1);
    CPPFASTBOX_ASSERT(split.l8 == 1);
    CPPFASTBOX_ASSERT(split.l4 == 1);
    CPPFASTBOX_ASSERT(split.l2 == 1);
    CPPFASTBOX_ASSERT(split.l1 == 1);
}
33+
34+
CPPFASTBOX_TEST(test_split_into_native_ls_lanes_32)
{
    // 246 bytes = 128 + 64 + 32 + 16 + 4 + 2: the 8-byte and 1-byte bits are clear.
    // With a 32-byte maximum lane, 128 + 64 + 32 becomes seven 32-byte lanes and l64 stays empty.
    constexpr auto byte_count{0b1111'0110};
    auto split{split_into_native_ls_lanes<32>(byte_count)};
    CPPFASTBOX_ASSERT(split.l64 == 0);
    CPPFASTBOX_ASSERT(split.l32 == 7);
    CPPFASTBOX_ASSERT(split.l16 == 1);
    CPPFASTBOX_ASSERT(split.l8 == 0);
    CPPFASTBOX_ASSERT(split.l4 == 1);
    CPPFASTBOX_ASSERT(split.l2 == 1);
    CPPFASTBOX_ASSERT(split.l1 == 0);
}
46+
47+
CPPFASTBOX_TEST(test_split_into_native_ls_lanes_16)
{
    // 206 bytes = 128 + 64 + 8 + 4 + 2: the 32-, 16- and 1-byte bits are clear.
    // With a 16-byte maximum lane, 128 + 64 becomes twelve 16-byte lanes.
    constexpr auto byte_count{0b1100'1110};
    auto split{split_into_native_ls_lanes<16>(byte_count)};
    CPPFASTBOX_ASSERT(split.l64 == 0);
    CPPFASTBOX_ASSERT(split.l32 == 0);
    CPPFASTBOX_ASSERT(split.l16 == 12);
    CPPFASTBOX_ASSERT(split.l8 == 1);
    CPPFASTBOX_ASSERT(split.l4 == 1);
    CPPFASTBOX_ASSERT(split.l2 == 1);
    CPPFASTBOX_ASSERT(split.l1 == 0);
}
59+
60+
CPPFASTBOX_TEST(test_split_into_native_ls_lanes_8)
{
    // 247 bytes = 128 + 64 + 32 + 16 + 4 + 2 + 1: only the 8-byte bit is clear.
    // With an 8-byte maximum lane, everything from 16 upward becomes thirty 8-byte lanes.
    constexpr auto byte_count{0b1111'0111};
    auto split{split_into_native_ls_lanes<8>(byte_count)};
    CPPFASTBOX_ASSERT(split.l64 == 0);
    CPPFASTBOX_ASSERT(split.l32 == 0);
    CPPFASTBOX_ASSERT(split.l16 == 0);
    CPPFASTBOX_ASSERT(split.l8 == 30);
    CPPFASTBOX_ASSERT(split.l4 == 1);
    CPPFASTBOX_ASSERT(split.l2 == 1);
    CPPFASTBOX_ASSERT(split.l1 == 1);
}
72+
73+
CPPFASTBOX_TEST(test_split_into_native_ls_lanes_4)
{
    // 63 bytes: bits 0..5 are all set.
    // With a 4-byte maximum lane, 32 + 16 + 8 + 4 becomes fifteen 4-byte lanes.
    constexpr auto byte_count{0b0011'1111};
    auto split{split_into_native_ls_lanes<4>(byte_count)};
    CPPFASTBOX_ASSERT(split.l64 == 0);
    CPPFASTBOX_ASSERT(split.l32 == 0);
    CPPFASTBOX_ASSERT(split.l16 == 0);
    CPPFASTBOX_ASSERT(split.l8 == 0);
    CPPFASTBOX_ASSERT(split.l4 == 15);
    CPPFASTBOX_ASSERT(split.l2 == 1);
    CPPFASTBOX_ASSERT(split.l1 == 1);
}
85+
86+
CPPFASTBOX_TEST(test_split_into_native_ls_lanes_small)
{
    // Sizes smaller than the 4-byte maximum lane: only l2 / l1 may be non-zero.

    // 3 bytes -> one 2-byte lane plus one 1-byte lane
    {
        constexpr auto byte_count{0b11};
        auto split{split_into_native_ls_lanes<4>(byte_count)};
        CPPFASTBOX_ASSERT(split.l64 == 0);
        CPPFASTBOX_ASSERT(split.l32 == 0);
        CPPFASTBOX_ASSERT(split.l16 == 0);
        CPPFASTBOX_ASSERT(split.l8 == 0);
        CPPFASTBOX_ASSERT(split.l4 == 0);
        CPPFASTBOX_ASSERT(split.l2 == 1);
        CPPFASTBOX_ASSERT(split.l1 == 1);
    }

    // 1 byte -> a single 1-byte lane
    {
        constexpr auto byte_count{0b1};
        auto split{split_into_native_ls_lanes<4>(byte_count)};
        CPPFASTBOX_ASSERT(split.l64 == 0);
        CPPFASTBOX_ASSERT(split.l32 == 0);
        CPPFASTBOX_ASSERT(split.l16 == 0);
        CPPFASTBOX_ASSERT(split.l8 == 0);
        CPPFASTBOX_ASSERT(split.l4 == 0);
        CPPFASTBOX_ASSERT(split.l2 == 0);
        CPPFASTBOX_ASSERT(split.l1 == 1);
    }

    // 0 bytes -> every lane count is zero
    {
        constexpr auto byte_count{0};
        auto split{split_into_native_ls_lanes<4>(byte_count)};
        CPPFASTBOX_ASSERT(split.l64 == 0);
        CPPFASTBOX_ASSERT(split.l32 == 0);
        CPPFASTBOX_ASSERT(split.l16 == 0);
        CPPFASTBOX_ASSERT(split.l8 == 0);
        CPPFASTBOX_ASSERT(split.l4 == 0);
        CPPFASTBOX_ASSERT(split.l2 == 0);
        CPPFASTBOX_ASSERT(split.l1 == 0);
    }
}
122+
123+
#ifndef CPPFASTBOX_HOSTED_TEST
124+
int main()
125+
{
126+
test_split_into_native_ls_lanes_64();
127+
test_split_into_native_ls_lanes_32();
128+
test_split_into_native_ls_lanes_16();
129+
test_split_into_native_ls_lanes_8();
130+
test_split_into_native_ls_lanes_4();
131+
test_split_into_native_ls_lanes_small();
132+
}
133+
#endif

0 commit comments

Comments
 (0)