Skip to content

Commit

Permalink
[enhancement] support simd instructions on arm cpus through sse2neon (apache#10068)
Browse files Browse the repository at this point in the history

* [enhancement] support simd instructions on arm cpus through sse2neon
  • Loading branch information
yangzhg authored Jun 14, 2022
1 parent 7cf0cc7 commit 39a2785
Show file tree
Hide file tree
Showing 20 changed files with 106 additions and 3,177 deletions.
2 changes: 1 addition & 1 deletion be/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -412,7 +412,7 @@ if ("${CMAKE_BUILD_TARGET_ARCH}" STREQUAL "x86" OR "${CMAKE_BUILD_TARGET_ARCH}"
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -mavx2")
endif()
endif()
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-attributes -DS2_USE_GFLAGS -DS2_USE_GLOG")
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-attributes -DS2_USE_GFLAGS -DS2_USE_GLOG")

if (WITH_MYSQL)
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -DDORIS_WITH_MYSQL")
Expand Down
12 changes: 3 additions & 9 deletions be/src/exprs/block_bloom_filter_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@
// and modified by Doris

#ifdef __aarch64__
#include "util/sse2neon.h"
#else //__aarch64__
#include <sse2neon.h>
#else
#include <emmintrin.h>
#include <mm_malloc.h>
#endif
Expand Down Expand Up @@ -115,16 +115,10 @@ void BlockBloomFilter::bucket_insert(const uint32_t bucket_idx, const uint32_t h
new_bucket[i] = 1U << new_bucket[i];
}
for (int i = 0; i < 2; ++i) {
#ifdef __aarch64__
uint8x16_t new_bucket_neon = vreinterpretq_u8_u32(vld1q_u32(new_bucket + 4 * i));
uint8x16_t* existing_bucket = reinterpret_cast<uint8x16_t*>(&_directory[bucket_idx][4 * i]);
*existing_bucket = vorrq_u8(*existing_bucket, new_bucket_neon);
#else
__m128i new_bucket_sse = _mm_load_si128(reinterpret_cast<__m128i*>(new_bucket + 4 * i));
__m128i* existing_bucket =
reinterpret_cast<__m128i*>(&DCHECK_NOTNULL(_directory)[bucket_idx][4 * i]);
*existing_bucket = _mm_or_si128(*existing_bucket, new_bucket_sse);
#endif
}
}

Expand Down Expand Up @@ -194,7 +188,7 @@ Status BlockBloomFilter::or_equal_array(size_t n, const uint8_t* __restrict__ in

void BlockBloomFilter::or_equal_array_no_avx2(size_t n, const uint8_t* __restrict__ in,
uint8_t* __restrict__ out) {
#ifdef __SSE4_2__
#if defined(__SSE4_2__) || defined(__aarch64__)
// The trivial loop out[i] |= in[i] should auto-vectorize with gcc at -O3, but it is not
// written in a way that is very friendly to auto-vectorization. Instead, we manually
// vectorize, increasing the speed by up to 56x.
Expand Down
2 changes: 1 addition & 1 deletion be/src/util/bit_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
#include "gutil/bits.h"
#include "util/cpu_info.h"
#ifdef __aarch64__
#include "sse2neon.h"
#include <sse2neon.h>
#else
#include <emmintrin.h>
#include <immintrin.h>
Expand Down
14 changes: 9 additions & 5 deletions be/src/util/crc32c.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,10 @@
// https://github.com/facebook/rocksdb/blob/master/util/crc32c.cc

#include "util/crc32c.h"
#ifdef __SSE4_2__
#if defined(__SSE4_2__)
#include <nmmintrin.h>
#elif defined(__aarch64__)
#include <sse2neon.h>
#endif
#include "util/coding.h"

Expand Down Expand Up @@ -204,9 +206,8 @@ static inline uint64_t LE_LOAD64(const uint8_t* p) {
}

static inline void Fast_CRC32(uint64_t* l, uint8_t const** p) {
#ifndef __SSE4_2__
Slow_CRC32(l, p);
#elif defined(__LP64__) || defined(_WIN64)
#if defined(__SSE4_2__) || defined(__aarch64__)
#if (defined(__LP64__) || defined(_WIN64)) && !defined(__aarch64__)
*l = _mm_crc32_u64(*l, LE_LOAD64(*p));
*p += 8;
#else
Expand All @@ -215,6 +216,9 @@ static inline void Fast_CRC32(uint64_t* l, uint8_t const** p) {
*l = _mm_crc32_u32(static_cast<unsigned int>(*l), LE_LOAD32(*p));
*p += 4;
#endif
#else
Slow_CRC32(l, p);
#endif
}

template <void (*CRC32)(uint64_t*, uint8_t const**)>
Expand Down Expand Up @@ -261,7 +265,7 @@ uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
}

uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
#ifdef __SSE4_2__
#if defined(__SSE4_2__) || defined(__aarch64__)
return ExtendImpl<Fast_CRC32>(crc, buf, size);
#else
return ExtendImpl<Slow_CRC32>(crc, buf, size);
Expand Down
4 changes: 3 additions & 1 deletion be/src/util/hash_util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
// the code that is built and the runtime checks to control what code is run.
#ifdef __SSE4_2__
#include <nmmintrin.h>
#elif __aarch64__
#include <sse2neon.h>
#endif
#include <zlib.h>

Expand All @@ -44,7 +46,7 @@ class HashUtil {
// Hashes `bytes` bytes starting at `data` with zlib's crc32().
// `hash` is passed as the initial CRC value, so it acts as the seed (or as the
// running CRC when hashing a buffer in several calls). Returns the resulting CRC.
static uint32_t zlib_crc_hash(const void* data, int32_t bytes, uint32_t hash) {
    // zlib takes the buffer as a pointer to unsigned bytes.
    const auto* buffer = static_cast<const unsigned char*>(data);
    return crc32(hash, buffer, bytes);
}
#ifdef __SSE4_2__
#if defined(__SSE4_2__) || defined(__aarch64__)
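// Compute the CRC32 hash for data using SSE4.2 instructions (on aarch64 the same
// intrinsics are provided through sse2neon). The input hash parameter is
// the current hash/seed value.
// This should only be called if SSE4.2 (or its sse2neon mapping on aarch64) is supported.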
Expand Down
4 changes: 3 additions & 1 deletion be/src/util/simd/bits.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
#include <immintrin.h>
#elif __SSE2__
#include <emmintrin.h>
#elif __aarch64__
#include <sse2neon.h>
#endif

namespace doris {
Expand All @@ -35,7 +37,7 @@ inline uint32_t bytes32_mask_to_bits32_mask(const uint8_t* data) {
auto zero32 = _mm256_setzero_si256();
uint32_t mask = static_cast<uint32_t>(_mm256_movemask_epi8(
_mm256_cmpgt_epi8(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(data)), zero32)));
#elif __SSE2__
#elif defined(__SSE2__) || defined(__aarch64__)
auto zero16 = _mm_setzero_si128();
uint32_t mask =
(static_cast<uint32_t>(_mm_movemask_epi8(_mm_cmpgt_epi8(
Expand Down
4 changes: 3 additions & 1 deletion be/src/util/simd/lower_upper_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@

#ifdef __SSE2__
#include <emmintrin.h>
#elif __aarch64__
#include <sse2neon.h>
#endif
#include <stdint.h>

Expand All @@ -35,7 +37,7 @@ class LowerUpperImpl {
static void transfer(const uint8_t* src, const uint8_t* src_end, uint8_t* dst) {
const auto flip_case_mask = 'A' ^ 'a';

#ifdef __SSE2__
#if defined(__SSE2__) || defined(__aarch64__)
const auto bytes_sse = sizeof(__m128i);
const auto src_end_sse = src_end - (src_end - src) % bytes_sse;

Expand Down
12 changes: 8 additions & 4 deletions be/src/util/simd/vstring_function.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@

#include <cstdint>

#ifdef __aarch64__
#include <sse2neon.h>
#endif

#include "runtime/string_value.hpp"
#include "util/simd/lower_upper_impl.h"

Expand Down Expand Up @@ -48,7 +52,7 @@ namespace simd {

class VStringFunctions {
public:
#ifdef __SSE2__
#if defined(__SSE2__) || defined(__aarch64__)
/// Width of one 128-bit SIMD register in bytes, i.e. 16 chars per __m128i.
static constexpr auto REGISTER_SIZE = sizeof(__m128i);
#endif
Expand All @@ -59,7 +63,7 @@ class VStringFunctions {
}
auto begin = 0;
auto end = str.len - 1;
#ifdef __SSE2__
#if defined(__SSE2__) || defined(__aarch64__)
char blank = ' ';
const auto pattern = _mm_set1_epi8(blank);
while (end - begin + 1 >= REGISTER_SIZE) {
Expand Down Expand Up @@ -91,7 +95,7 @@ class VStringFunctions {
}
auto begin = 0;
auto end = str.len - 1;
#ifdef __SSE2__
#if defined(__SSE2__) || defined(__aarch64__)
char blank = ' ';
const auto pattern = _mm_set1_epi8(blank);
while (end - begin + 1 >= REGISTER_SIZE) {
Expand Down Expand Up @@ -155,7 +159,7 @@ class VStringFunctions {
static constexpr auto hex_table = "0123456789ABCDEF";
auto src_str_end = src_str + length;

#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__aarch64__)
constexpr auto step = sizeof(uint64);
if (src_str + step < src_str_end) {
const auto hex_map = _mm_loadu_si128(reinterpret_cast<const __m128i*>(hex_table));
Expand Down
Loading

0 comments on commit 39a2785

Please sign in to comment.