改进split_into_native_ls_lanes向量化

24bit-xjkp · 24bit-xjkp · commit 9bf84f9a368b · 2024-02-22T18:00:58.000+08:00
diff --git a/include/base/utility.h b/include/base/utility.h
@@ -12,13 +12,21 @@
 
 namespace cppfastbox
 {
+    // int8_t as spelled by vector builtins and intrinsics (plain `char` on x86, `int8_t` elsewhere)
     using simd_int8_t = ::std::conditional_t<::cppfastbox::is_cpu_arch<::cppfastbox::cpu_arch::x86>, char, int8_t>;
+    // int16_t as spelled by vector builtins and intrinsics (`short` on x86, `int16_t` elsewhere)
     using simd_int16_t = ::std::conditional_t<::cppfastbox::is_cpu_arch<::cppfastbox::cpu_arch::x86>, short, int16_t>;
+    // int32_t as spelled by vector builtins and intrinsics (`int` on x86, `int32_t` elsewhere)
     using simd_int32_t = ::std::conditional_t<::cppfastbox::is_cpu_arch<::cppfastbox::cpu_arch::x86>, int, int32_t>;
+    // int64_t as spelled by vector builtins and intrinsics; x86 builtins use `long long` for 64-bit lanes (was `char`, an 8-bit type)
-    using simd_int64_t = ::std::conditional_t<::cppfastbox::is_cpu_arch<::cppfastbox::cpu_arch::x86>, char, int64_t>;
+    using simd_int64_t = ::std::conditional_t<::cppfastbox::is_cpu_arch<::cppfastbox::cpu_arch::x86>, long long, int64_t>;
+    // Unsigned counterpart of simd_int8_t
     using simd_uint8_t = ::std::make_unsigned_t<::cppfastbox::simd_int8_t>;
+    // Unsigned counterpart of simd_int16_t
     using simd_uint16_t = ::std::make_unsigned_t<::cppfastbox::simd_int16_t>;
+    // Unsigned counterpart of simd_int32_t
     using simd_uint32_t = ::std::make_unsigned_t<::cppfastbox::simd_int32_t>;
+    // Unsigned counterpart of simd_int64_t
     using simd_uint64_t = ::std::make_unsigned_t<::cppfastbox::simd_int64_t>;
 }  // namespace cppfastbox
 
@@ -390,7 +398,7 @@ namespace cppfastbox
      *
      * @tparam lane_max_size 分解时最大读写通道的大小
      * @param size 要分解的读写操作的字节数
-     * @note 分解时会尝试向量化，使用的向量大小等同于`lane_max_size`
+     * @note 分解时会尝试向量化
      */
     template <::std::size_t lane_max_size = ::cppfastbox::native_ls_lane_max_size>
     constexpr inline auto split_into_native_ls_lanes(::std::size_t size) noexcept
@@ -417,8 +425,8 @@ namespace cppfastbox
             {
                 using v [[gnu::vector_size(64)]] = ::std::size_t;
                 v vsize{size, size, size, size, size, size, size, size};
-                vsize >>= v{0, 0, 1, 2, 3, 4, 5, 6};
-                vsize &= v{0, 1, 1, 1, 1, 1, 1, -1zu};
+                vsize >>= v{6, 5, 4, 3, 2, 1, 0, 0};
+                vsize &= v{-1zu, 1, 1, 1, 1, 1, 1, 1};
                 __builtin_memcpy(&lanes, &vsize, 64);
             }
         }
@@ -435,19 +443,39 @@ namespace cppfastbox
             }
             else
             {
-                using v [[gnu::vector_size(32)]] = ::std::size_t;
-                constexpr v vand{1, 1, 1, 1};
-                v vsize1{size, size, size, size};
-                v vsize2{size, size, size, size};
-                vsize1 >>= v{3, 4, 5, 64};
-                vsize2 >>= v{0, 0, 1, 2};
-                vsize1 &= vand;
-                vsize2 &= vand;
-                __builtin_memcpy(&lanes, &vsize1, 32);
-                __builtin_memcpy(&lanes.l4, &vsize2, 32);
+                if constexpr(::cppfastbox::cpu_flags::sve_support)
+                {
+                    using v [[gnu::vector_size(64)]] = ::std::size_t;
+                    v vsize{size, size, size, size, size, size, size, size};
+                    vsize >>= v{0, 5, 4, 3, 2, 1, 0, 0};  // slot 0 (ahead of l32) is not shifted: a count of 64 equals the element width and is undefined
+                    vsize &= v{0, -1zu, 1, 1, 1, 1, 1, 1};  // zero mask clears slot 0; slot 1 keeps the whole `size >> 5` count; the rest keep one bit
+                    __builtin_memcpy(&lanes, &vsize, 64);
+                }
+                // Scalar form: whether to vectorize is left to the compiler
+                else if constexpr(::cppfastbox::cpu_flags::neon_support)
+                {
+                    lanes.l32 = size >> 5;
+                    lanes.l16 = (size >> 4) & 1;
+                    lanes.l8 = (size >> 3) & 1;
+                    lanes.l4 = (size >> 2) & 1;
+                    lanes.l2 = (size >> 1) & 1;
+                    lanes.l1 = size & 1;
+                }
+                else
+                {
+                    using v [[gnu::vector_size(32)]] = ::std::size_t;
+                    v vsize1{size, size, size, size};
+                    v vsize2{size, size, size, size};
+                    vsize1 >>= v{0, 5, 4, 3};  // slot 0 is not shifted (a count of 64 would be undefined); it is zeroed by the mask below
+                    vsize2 >>= v{2, 1, 0, 0};
+                    vsize1 &= v{0, -1zu, 1, 1};  // zero mask clears slot 0; slot 1 keeps the whole `size >> 5` count
+                    vsize2 &= v{1, 1, 1, 1};
+                    __builtin_memcpy(&lanes, &vsize1, 32);  // first half of the result, starting at the beginning of `lanes`
+                    __builtin_memcpy(&lanes.l4, &vsize2, 32);  // second half, starting at `lanes.l4`
+                }
             }
         }
-        // 向量化效果不理想
+        // 由编译器决定是否向量化
         else if constexpr(lane_max_size == 16)
         {
             lanes.l16 = size >> 4;
diff --git a/test/base/split_into_native_ls_lanes_rt.cpp b/test/base/split_into_native_ls_lanes_rt.cpp
@@ -0,0 +1,133 @@
+/**
+ * @file split_into_native_ls_lanes_rt.cpp
+ * @brief 测试split_into_native_ls_lanes的运行时实现
+ *
+ * @copyright Copyright (c) 2024-present Trajectronix Open Source Group
+ *
+ */
+#include "../../include/base/utility.h"
+#ifdef CPPFASTBOX_HOSTED_TEST
+    #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+    #include <doctest/doctest.h>
+    #define CPPFASTBOX_ASSERT CHECK
+    #define CPPFASTBOX_TEST(name) TEST_CASE(#name)
+#else
+    #include "../../include/libc/assert.h"
+    #define CPPFASTBOX_ASSERT always_assert
+    #define CPPFASTBOX_TEST(name) void name() noexcept
+#endif
+using namespace cppfastbox;
+
+// 255 has bits 0..7 set: expect three 64-byte lanes (192 bytes) plus one lane of every smaller width.
+CPPFASTBOX_TEST(test_split_into_native_ls_lanes_64)
+{
+    const auto lanes = split_into_native_ls_lanes<64>(0b1111'1111);
+    CPPFASTBOX_ASSERT(lanes.l64 == 3);
+    CPPFASTBOX_ASSERT(lanes.l32 == 1);
+    CPPFASTBOX_ASSERT(lanes.l16 == 1);
+    CPPFASTBOX_ASSERT(lanes.l8 == 1);
+    CPPFASTBOX_ASSERT(lanes.l4 == 1);
+    CPPFASTBOX_ASSERT(lanes.l2 == 1);
+    CPPFASTBOX_ASSERT(lanes.l1 == 1);
+}
+
+// 246 = 0b1111'0110: expect seven 32-byte lanes (224 bytes), then 16 + 4 + 2; no 64-, 8- or 1-byte lane.
+CPPFASTBOX_TEST(test_split_into_native_ls_lanes_32)
+{
+    const auto lanes = split_into_native_ls_lanes<32>(0b1111'0110);
+    CPPFASTBOX_ASSERT(lanes.l64 == 0);
+    CPPFASTBOX_ASSERT(lanes.l32 == 7);
+    CPPFASTBOX_ASSERT(lanes.l16 == 1);
+    CPPFASTBOX_ASSERT(lanes.l8 == 0);
+    CPPFASTBOX_ASSERT(lanes.l4 == 1);
+    CPPFASTBOX_ASSERT(lanes.l2 == 1);
+    CPPFASTBOX_ASSERT(lanes.l1 == 0);
+}
+
+// 206 = 0b1100'1110: expect twelve 16-byte lanes (192 bytes), then 8 + 4 + 2; no wider lane and no 1-byte lane.
+CPPFASTBOX_TEST(test_split_into_native_ls_lanes_16)
+{
+    const auto lanes = split_into_native_ls_lanes<16>(0b1100'1110);
+    CPPFASTBOX_ASSERT(lanes.l64 == 0);
+    CPPFASTBOX_ASSERT(lanes.l32 == 0);
+    CPPFASTBOX_ASSERT(lanes.l16 == 12);
+    CPPFASTBOX_ASSERT(lanes.l8 == 1);
+    CPPFASTBOX_ASSERT(lanes.l4 == 1);
+    CPPFASTBOX_ASSERT(lanes.l2 == 1);
+    CPPFASTBOX_ASSERT(lanes.l1 == 0);
+}
+
+// 247 = 0b1111'0111: expect thirty 8-byte lanes (240 bytes), then 4 + 2 + 1; no wider lane.
+CPPFASTBOX_TEST(test_split_into_native_ls_lanes_8)
+{
+    const auto lanes = split_into_native_ls_lanes<8>(0b1111'0111);
+    CPPFASTBOX_ASSERT(lanes.l64 == 0);
+    CPPFASTBOX_ASSERT(lanes.l32 == 0);
+    CPPFASTBOX_ASSERT(lanes.l16 == 0);
+    CPPFASTBOX_ASSERT(lanes.l8 == 30);
+    CPPFASTBOX_ASSERT(lanes.l4 == 1);
+    CPPFASTBOX_ASSERT(lanes.l2 == 1);
+    CPPFASTBOX_ASSERT(lanes.l1 == 1);
+}
+
+// 63 = 0b0011'1111: expect fifteen 4-byte lanes (60 bytes), then 2 + 1; no wider lane.
+CPPFASTBOX_TEST(test_split_into_native_ls_lanes_4)
+{
+    const auto lanes = split_into_native_ls_lanes<4>(0b0011'1111);
+    CPPFASTBOX_ASSERT(lanes.l64 == 0);
+    CPPFASTBOX_ASSERT(lanes.l32 == 0);
+    CPPFASTBOX_ASSERT(lanes.l16 == 0);
+    CPPFASTBOX_ASSERT(lanes.l8 == 0);
+    CPPFASTBOX_ASSERT(lanes.l4 == 15);
+    CPPFASTBOX_ASSERT(lanes.l2 == 1);
+    CPPFASTBOX_ASSERT(lanes.l1 == 1);
+}
+
+CPPFASTBOX_TEST(test_split_into_native_ls_lanes_small)
+{
+    {
+        // 3 bytes: smaller than the 4-byte lane, so only the 2-byte and 1-byte lanes are expected.
+        const auto lanes = split_into_native_ls_lanes<4>(0b11);
+        CPPFASTBOX_ASSERT(lanes.l64 == 0);
+        CPPFASTBOX_ASSERT(lanes.l32 == 0);
+        CPPFASTBOX_ASSERT(lanes.l16 == 0);
+        CPPFASTBOX_ASSERT(lanes.l8 == 0);
+        CPPFASTBOX_ASSERT(lanes.l4 == 0);
+        CPPFASTBOX_ASSERT(lanes.l2 == 1);
+        CPPFASTBOX_ASSERT(lanes.l1 == 1);
+    }
+    {
+        // 1 byte: only the 1-byte lane is expected.
+        const auto lanes = split_into_native_ls_lanes<4>(0b1);
+        CPPFASTBOX_ASSERT(lanes.l64 == 0);
+        CPPFASTBOX_ASSERT(lanes.l32 == 0);
+        CPPFASTBOX_ASSERT(lanes.l16 == 0);
+        CPPFASTBOX_ASSERT(lanes.l8 == 0);
+        CPPFASTBOX_ASSERT(lanes.l4 == 0);
+        CPPFASTBOX_ASSERT(lanes.l2 == 0);
+        CPPFASTBOX_ASSERT(lanes.l1 == 1);
+    }
+    {
+        // 0 bytes: every lane count is expected to be zero.
+        const auto lanes = split_into_native_ls_lanes<4>(0);
+        CPPFASTBOX_ASSERT(lanes.l64 == 0);
+        CPPFASTBOX_ASSERT(lanes.l32 == 0);
+        CPPFASTBOX_ASSERT(lanes.l16 == 0);
+        CPPFASTBOX_ASSERT(lanes.l8 == 0);
+        CPPFASTBOX_ASSERT(lanes.l4 == 0);
+        CPPFASTBOX_ASSERT(lanes.l2 == 0);
+        CPPFASTBOX_ASSERT(lanes.l1 == 0);
+    }
+}
+
+#ifndef CPPFASTBOX_HOSTED_TEST
+int main()
+{
+    // Built without CPPFASTBOX_HOSTED_TEST there is no test runner: call every case by hand, in declaration order.
+    using test_case = void (*)() noexcept;
+    constexpr test_case cases[]{test_split_into_native_ls_lanes_64, test_split_into_native_ls_lanes_32, test_split_into_native_ls_lanes_16,
+                                test_split_into_native_ls_lanes_8,  test_split_into_native_ls_lanes_4,  test_split_into_native_ls_lanes_small};
+    for(const auto run : cases) { run(); }
+    return 0;
+}
+#endif