[enhancement] support simd instructions on arm cpus through sse2neon (apache#10068)

* [enhancement] support simd instructions on arm cpus through sse2neon
adonis0147 · Jun 14, 2022 · 39a2785 · 39a2785
1 parent 7cf0cc7
commit 39a2785
Show file tree

Hide file tree

Showing 20 changed files with 106 additions and 3,177 deletions.
diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt
@@ -412,7 +412,7 @@ if ("${CMAKE_BUILD_TARGET_ARCH}" STREQUAL "x86" OR "${CMAKE_BUILD_TARGET_ARCH}"
         set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -mavx2")
     endif()
 endif()
-set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS}  -Wno-attributes -DS2_USE_GFLAGS -DS2_USE_GLOG")
+set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-attributes -DS2_USE_GFLAGS -DS2_USE_GLOG")
 
 if (WITH_MYSQL)
     set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -DDORIS_WITH_MYSQL")

diff --git a/be/src/exprs/block_bloom_filter_impl.cc b/be/src/exprs/block_bloom_filter_impl.cc
@@ -20,8 +20,8 @@
 // and modified by Doris
 
 #ifdef __aarch64__
-#include "util/sse2neon.h"
-#else //__aarch64__
+#include <sse2neon.h>
+#else
 #include <emmintrin.h>
 #include <mm_malloc.h>
 #endif
@@ -115,16 +115,10 @@ void BlockBloomFilter::bucket_insert(const uint32_t bucket_idx, const uint32_t h
         new_bucket[i] = 1U << new_bucket[i];
     }
     for (int i = 0; i < 2; ++i) {
-#ifdef __aarch64__
-        uint8x16_t new_bucket_neon = vreinterpretq_u8_u32(vld1q_u32(new_bucket + 4 * i));
-        uint8x16_t* existing_bucket = reinterpret_cast<uint8x16_t*>(&_directory[bucket_idx][4 * i]);
-        *existing_bucket = vorrq_u8(*existing_bucket, new_bucket_neon);
-#else
         __m128i new_bucket_sse = _mm_load_si128(reinterpret_cast<__m128i*>(new_bucket + 4 * i));
         __m128i* existing_bucket =
                 reinterpret_cast<__m128i*>(&DCHECK_NOTNULL(_directory)[bucket_idx][4 * i]);
         *existing_bucket = _mm_or_si128(*existing_bucket, new_bucket_sse);
-#endif
     }
 }
 
@@ -194,7 +188,7 @@ Status BlockBloomFilter::or_equal_array(size_t n, const uint8_t* __restrict__ in
 
 void BlockBloomFilter::or_equal_array_no_avx2(size_t n, const uint8_t* __restrict__ in,
                                               uint8_t* __restrict__ out) {
-#ifdef __SSE4_2__
+#if defined(__SSE4_2__) || defined(__aarch64__)
     // The trivial loop out[i] |= in[i] should auto-vectorize with gcc at -O3, but it is not
     // written in a way that is very friendly to auto-vectorization. Instead, we manually
     // vectorize, increasing the speed by up to 56x.

diff --git a/be/src/util/bit_util.h b/be/src/util/bit_util.h
@@ -26,7 +26,7 @@
 #include "gutil/bits.h"
 #include "util/cpu_info.h"
 #ifdef __aarch64__
-#include "sse2neon.h"
+#include <sse2neon.h>
 #else
 #include <emmintrin.h>
 #include <immintrin.h>

diff --git a/be/src/util/crc32c.cpp b/be/src/util/crc32c.cpp
@@ -19,8 +19,10 @@
 // https://github.com/facebook/rocksdb/blob/master/util/crc32c.cc
 
 #include "util/crc32c.h"
-#ifdef __SSE4_2__
+#if defined(__SSE4_2__)
 #include <nmmintrin.h>
+#elif defined(__aarch64__)
+#include <sse2neon.h>
 #endif
 #include "util/coding.h"
 
@@ -204,9 +206,8 @@ static inline uint64_t LE_LOAD64(const uint8_t* p) {
 }
 
 static inline void Fast_CRC32(uint64_t* l, uint8_t const** p) {
-#ifndef __SSE4_2__
-    Slow_CRC32(l, p);
-#elif defined(__LP64__) || defined(_WIN64)
+#if defined(__SSE4_2__) || defined(__aarch64__)
+#if (defined(__LP64__) || defined(_WIN64)) && !defined(__aarch64__)
     *l = _mm_crc32_u64(*l, LE_LOAD64(*p));
     *p += 8;
 #else
@@ -215,6 +216,9 @@ static inline void Fast_CRC32(uint64_t* l, uint8_t const** p) {
     *l = _mm_crc32_u32(static_cast<unsigned int>(*l), LE_LOAD32(*p));
     *p += 4;
 #endif
+#else
+    Slow_CRC32(l, p);
+#endif
 }
 
 template <void (*CRC32)(uint64_t*, uint8_t const**)>
@@ -261,7 +265,7 @@ uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
 }
 
 uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
-#ifdef __SSE4_2__
+#if defined(__SSE4_2__) || defined(__aarch64__)
     return ExtendImpl<Fast_CRC32>(crc, buf, size);
 #else
     return ExtendImpl<Slow_CRC32>(crc, buf, size);

diff --git a/be/src/util/hash_util.hpp b/be/src/util/hash_util.hpp
@@ -29,6 +29,8 @@
 // the code that is built and the runtime checks to control what code is run.
 #ifdef __SSE4_2__
 #include <nmmintrin.h>
+#elif __aarch64__
+#include <sse2neon.h>
 #endif
 #include <zlib.h>
 
@@ -44,7 +46,7 @@ class HashUtil {
     static uint32_t zlib_crc_hash(const void* data, int32_t bytes, uint32_t hash) {
         return crc32(hash, (const unsigned char*)data, bytes);
     }
-#ifdef __SSE4_2__
+#if defined(__SSE4_2__) || defined(__aarch64__)
     // Compute the Crc32 hash for data using SSE4 instructions.  The input hash parameter is
     // the current hash/seed value.
     // This should only be called if SSE is supported.

diff --git a/be/src/util/simd/bits.h b/be/src/util/simd/bits.h
@@ -23,6 +23,8 @@
 #include <immintrin.h>
 #elif __SSE2__
 #include <emmintrin.h>
+#elif __aarch64__
+#include <sse2neon.h>
 #endif
 
 namespace doris {
@@ -35,7 +37,7 @@ inline uint32_t bytes32_mask_to_bits32_mask(const uint8_t* data) {
     auto zero32 = _mm256_setzero_si256();
     uint32_t mask = static_cast<uint32_t>(_mm256_movemask_epi8(
             _mm256_cmpgt_epi8(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(data)), zero32)));
-#elif __SSE2__
+#elif defined(__SSE2__) || defined(__aarch64__)
     auto zero16 = _mm_setzero_si128();
     uint32_t mask =
             (static_cast<uint32_t>(_mm_movemask_epi8(_mm_cmpgt_epi8(

diff --git a/be/src/util/simd/lower_upper_impl.h b/be/src/util/simd/lower_upper_impl.h
@@ -19,6 +19,8 @@
 
 #ifdef __SSE2__
 #include <emmintrin.h>
+#elif __aarch64__
+#include <sse2neon.h>
 #endif
 #include <stdint.h>
 
@@ -35,7 +37,7 @@ class LowerUpperImpl {
     static void transfer(const uint8_t* src, const uint8_t* src_end, uint8_t* dst) {
         const auto flip_case_mask = 'A' ^ 'a';
 
-#ifdef __SSE2__
+#if defined(__SSE2__) || defined(__aarch64__)
         const auto bytes_sse = sizeof(__m128i);
         const auto src_end_sse = src_end - (src_end - src) % bytes_sse;
 

diff --git a/be/src/util/simd/vstring_function.h b/be/src/util/simd/vstring_function.h
@@ -21,6 +21,10 @@
 
 #include <cstdint>
 
+#ifdef __aarch64__
+#include <sse2neon.h>
+#endif
+
 #include "runtime/string_value.hpp"
 #include "util/simd/lower_upper_impl.h"
 
@@ -48,7 +52,7 @@ namespace simd {
 
 class VStringFunctions {
 public:
-#ifdef __SSE2__
+#if defined(__SSE2__) || defined(__aarch64__)
     /// n equals to 16 chars length
     static constexpr auto REGISTER_SIZE = sizeof(__m128i);
 #endif
@@ -59,7 +63,7 @@ class VStringFunctions {
         }
         auto begin = 0;
         auto end = str.len - 1;
-#ifdef __SSE2__
+#if defined(__SSE2__) || defined(__aarch64__)
         char blank = ' ';
         const auto pattern = _mm_set1_epi8(blank);
         while (end - begin + 1 >= REGISTER_SIZE) {
@@ -91,7 +95,7 @@ class VStringFunctions {
         }
         auto begin = 0;
         auto end = str.len - 1;
-#ifdef __SSE2__
+#if defined(__SSE2__) || defined(__aarch64__)
         char blank = ' ';
         const auto pattern = _mm_set1_epi8(blank);
         while (end - begin + 1 >= REGISTER_SIZE) {
@@ -155,7 +159,7 @@ class VStringFunctions {
         static constexpr auto hex_table = "0123456789ABCDEF";
         auto src_str_end = src_str + length;
 
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(__aarch64__)
         constexpr auto step = sizeof(uint64);
         if (src_str + step < src_str_end) {
             const auto hex_map = _mm_loadu_si128(reinterpret_cast<const __m128i*>(hex_table));