Skip to content

Commit

Permalink
[improvement](bitshuffle)Enable avx512 support in bitshuffle for perf…
Browse files Browse the repository at this point in the history
…ormance boost (apache#15972)

As AVX512 is available in most modern processors, it is worth using when it provides a performance boost.
The latest bitshuffle release adds AVX512 support. We can integrate it into Doris for AVX512-capable CPUs.

Tested with the master branch: the SSB queries (q1.1.sql~q4.3.sql, 13 queries in total) are boosted by 1.4%~3.2%. (Measured by running run-ssb-queries.sh 5 times, each time with 100 iterations.)

Signed-off-by: Wu, Kaiqiang <kaiqiang.wu@intel.com>
Co-authored-by: vesslanjin <jun.i.jin@intel.com>
Change-Id: Iec7a7e6640ef9c3873b3f966e9c7f42d25d2e73e
  • Loading branch information
2 people authored and luozenglin committed Sep 13, 2023
1 parent db0dbda commit 8a62f3f
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 7 deletions.
4 changes: 4 additions & 0 deletions be/src/gutil/cpu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ CPU::CPU()
has_popcnt_(false),
has_avx_(false),
has_avx2_(false),
has_avx512_(false),
has_aesni_(false),
has_non_stop_time_stamp_counter_(false),
is_running_in_vm_(false),
Expand Down Expand Up @@ -201,6 +202,8 @@ void CPU::Initialize() {
(xgetbv(0) & 6) == 6 /* XSAVE enabled by kernel */;
has_aesni_ = (cpu_info[2] & 0x02000000) != 0;
has_avx2_ = has_avx_ && (cpu_info7[1] & 0x00000020) != 0;
has_avx512_ = has_avx2_ && (cpu_info7[1] & 0x00010000) != 0 &&
(cpu_info7[1] & 0x40000000) != 0 && (cpu_info7[1] & 0x80000000) != 0;
}
// Get the brand string of the cpu.
__cpuid(cpu_info, 0x80000000);
Expand Down Expand Up @@ -253,6 +256,7 @@ void CPU::Initialize() {
#endif
}
CPU::IntelMicroArchitecture CPU::GetIntelMicroArchitecture() const {
if (has_avx512()) return AVX512;
if (has_avx2()) return AVX2;
if (has_avx()) return AVX;
if (has_sse42()) return SSE42;
Expand Down
3 changes: 3 additions & 0 deletions be/src/gutil/cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ class CPU final {
SSE42,
AVX,
AVX2,
AVX512,
MAX_INTEL_MICRO_ARCHITECTURE
};
// Accessors for CPU information.
Expand All @@ -81,6 +82,7 @@ class CPU final {
bool has_popcnt() const { return has_popcnt_; }
bool has_avx() const { return has_avx_; }
bool has_avx2() const { return has_avx2_; }
bool has_avx512() const { return has_avx512_; }
bool has_aesni() const { return has_aesni_; }
bool has_non_stop_time_stamp_counter() const { return has_non_stop_time_stamp_counter_; }
bool is_running_in_vm() const { return is_running_in_vm_; }
Expand All @@ -107,6 +109,7 @@ class CPU final {
bool has_popcnt_;
bool has_avx_;
bool has_avx2_;
bool has_avx512_;
bool has_aesni_;
bool has_non_stop_time_stamp_counter_;
bool is_running_in_vm_;
Expand Down
17 changes: 16 additions & 1 deletion be/src/olap/rowset/segment_v2/bitshuffle_wrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,17 @@
#undef bshuf_compress_lz4
#undef bshuf_decompress_lz4

// Include the bitshuffle header again, but this time importing the
// AVX512-compiled symbols by defining some macros.
// Clearing BITSHUFFLE_H (presumably the header's include guard -- confirm
// against bitshuffle.h) allows the header to be processed a second time.
#undef BITSHUFFLE_H
// While these macros are active, every mention of the three function names
// inside the header is rewritten to the '_avx512'-suffixed name.
#define bshuf_compress_lz4_bound bshuf_compress_lz4_bound_avx512
#define bshuf_compress_lz4 bshuf_compress_lz4_avx512
#define bshuf_decompress_lz4 bshuf_decompress_lz4_avx512
#include <bitshuffle/bitshuffle.h> // NOLINT(*)
// Remove the renaming macros so the unsuffixed names can be used again in the
// code that follows.
#undef bshuf_compress_lz4_bound
#undef bshuf_compress_lz4
#undef bshuf_decompress_lz4

using base::CPU;

namespace doris {
Expand All @@ -54,7 +65,11 @@ decltype(&bshuf_decompress_lz4) g_bshuf_decompress_lz4;
// the cost of a 'std::once' call.
__attribute__((constructor)) void SelectBitshuffleFunctions() {
#if (defined(__i386) || defined(__x86_64__))
if (CPU().has_avx2()) {
if (CPU().has_avx512()) {
g_bshuf_compress_lz4_bound = bshuf_compress_lz4_bound_avx512;
g_bshuf_compress_lz4 = bshuf_compress_lz4_avx512;
g_bshuf_decompress_lz4 = bshuf_decompress_lz4_avx512;
} else if (CPU().has_avx2()) {
g_bshuf_compress_lz4_bound = bshuf_compress_lz4_bound_avx2;
g_bshuf_compress_lz4 = bshuf_compress_lz4_avx2;
g_bshuf_decompress_lz4 = bshuf_decompress_lz4_avx2;
Expand Down
15 changes: 9 additions & 6 deletions thirdparty/build-thirdparty.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1049,11 +1049,11 @@ build_bitshuffle() {
cd "${TP_SOURCE_DIR}/${BITSHUFFLE_SOURCE}"
PREFIX="${TP_INSTALL_DIR}"

# This library has significant optimizations when built with -mavx2. However,
# we still need to support non-AVX2-capable hardware. So, we build it twice,
# once with the flag and once without, and use some linker tricks to
# suffix the AVX2 symbols with '_avx2'.
arches=('default' 'avx2')
# This library has significant optimizations when built with AVX2/AVX512. However,
# we still need to support hardware without AVX2/AVX512. So, we build it three times:
# once with the AVX2 flag, once with the AVX512 flags, and once with neither. Linker
# tricks then suffix the AVX2 symbols with '_avx2' and the AVX512 symbols with '_avx512'.
arches=('default' 'avx2' 'avx512')
MACHINE_TYPE="$(uname -m)"
# Because aarch64 doesn't support avx2, disable it.
if [[ "${MACHINE_TYPE}" == "aarch64" || "${MACHINE_TYPE}" == 'arm64' ]]; then
Expand All @@ -1066,6 +1066,9 @@ build_bitshuffle() {
if [[ "${arch}" == "avx2" ]]; then
arch_flag="-mavx2"
fi
if [[ "${arch}" == "avx512" ]]; then
arch_flag="-mavx512bw -mavx512f"
fi
tmp_obj="bitshuffle_${arch}_tmp.o"
dst_obj="bitshuffle_${arch}.o"
"${CC}" ${EXTRA_CFLAGS:+${EXTRA_CFLAGS}} ${arch_flag:+${arch_flag}} -std=c99 "-I${PREFIX}/include/lz4" -O3 -DNDEBUG -c \
Expand All @@ -1075,7 +1078,7 @@ build_bitshuffle() {
# Merge the object files together to produce a combined .o file.
"${ld}" -r -o "${tmp_obj}" bitshuffle_core.o bitshuffle.o iochain.o
# For the AVX2 and AVX512 symbols, suffix them.
if [[ "${arch}" == "avx2" ]]; then
if [[ "${arch}" == "avx2" ]] || [[ "${arch}" == "avx512" ]]; then
local nm="${DORIS_BIN_UTILS}/nm"
local objcopy="${DORIS_BIN_UTILS}/objcopy"

Expand Down

0 comments on commit 8a62f3f

Please sign in to comment.