diff --git a/be/src/bench/CMakeLists.txt b/be/src/bench/CMakeLists.txt index bacc896e84af5..fd10c4657c3b8 100644 --- a/be/src/bench/CMakeLists.txt +++ b/be/src/bench/CMakeLists.txt @@ -27,4 +27,5 @@ ADD_BE_BENCH(${SRC_DIR}/bench/roaring_bitmap_mem_bench) ADD_BE_BENCH(${SRC_DIR}/bench/parquet_dict_decode_bench) ADD_BE_BENCH(${SRC_DIR}/bench/get_dict_codes_bench) ADD_BE_BENCH(${SRC_DIR}/bench/persistent_index_bench) -ADD_BE_BENCH(${SRC_DIR}/bench/orc_column_reader_bench) \ No newline at end of file +ADD_BE_BENCH(${SRC_DIR}/bench/orc_column_reader_bench) +ADD_BE_BENCH(${SRC_DIR}/bench/hash_functions_bench) diff --git a/be/src/bench/bench.h b/be/src/bench/bench.h index 2ee1069cb8cd7..5513295b3f137 100644 --- a/be/src/bench/bench.h +++ b/be/src/bench/bench.h @@ -44,7 +44,8 @@ class Bench { return column; } - static ColumnPtr create_random_column(const TypeDescriptor& type_desc, int num_rows, bool low_card, bool nullable) { + static ColumnPtr create_random_column(const TypeDescriptor& type_desc, int num_rows, bool low_card, bool nullable, + size_t min_length = 0) { using UniformInt = std::uniform_int_distribution; using PoissonInt = std::poisson_distribution; ColumnPtr column = ColumnHelper::create_column(type_desc, nullable); @@ -64,7 +65,7 @@ class Bench { "abcdefghijklmnopqrstuvwxyz"; auto gen_rand_str = [&]() { - int str_len = uniform_int(rng) % 20; + int str_len = uniform_int(rng) % 20 + min_length; int str_start = std::min(poisson_int(rng) % alphanum.size(), alphanum.size() - str_len); Slice rand_str(alphanum.c_str() + str_start, str_len); return rand_str; diff --git a/be/src/bench/hash_functions_bench.cpp b/be/src/bench/hash_functions_bench.cpp new file mode 100644 index 0000000000000..7b65070a0eff5 --- /dev/null +++ b/be/src/bench/hash_functions_bench.cpp @@ -0,0 +1,90 @@ +// Copyright 2021-present StarRocks, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include +#include + +#include "bench.h" +#include "exprs/hash_functions.h" + +namespace starrocks { + +class HashFunctionsBench { +public: + void SetUp(); + void TearDown() {} + + HashFunctionsBench(size_t num_column, size_t num_rows) : _num_column(num_column), _num_rows(num_rows) {} + + void do_bench(benchmark::State& state, size_t num_column, bool test_default_hash); + +private: + const TypeDescriptor type_desc = TypeDescriptor(TYPE_VARCHAR); + size_t _num_column = 0; + size_t _num_rows = 0; + std::vector _columns{}; +}; + +void HashFunctionsBench::SetUp() { + for (int i = 0; i < _num_column; i++) { + auto columnPtr = Bench::create_random_column(type_desc, _num_rows, false, false, 32); + _columns.push_back(std::move(columnPtr)); + } +} + +void HashFunctionsBench::do_bench(benchmark::State& state, size_t num_rows, bool test_default_hash) { + std::unique_ptr ctx(FunctionContext::create_test_context()); + if (test_default_hash) { + ColumnPtr result = HashFunctions::murmur_hash3_32(ctx.get(), _columns).value(); + auto column = ColumnHelper::cast_to(result); + } else { + ColumnPtr result = HashFunctions::xx_hash3_64(ctx.get(), _columns).value(); + auto column = ColumnHelper::cast_to(result); + } +} + +static void BM_HashFunctions_Eval_Arg(benchmark::internal::Benchmark* b) { + b->Args({10, true}); + b->Args({10, false}); + b->Args({100, true}); + b->Args({100, false}); + b->Args({10000, true}); + b->Args({10000, false}); + b->Args({1000000, true}); + b->Args({1000000, false}); + b->Iterations(10000); +} + +static void BM_HashFunctions_Eval(benchmark::State& state) { + size_t num_rows = state.range(0); + bool test_default_hash = state.range(1); + + HashFunctionsBench hashFunctionsBench(1, num_rows); + hashFunctionsBench.SetUp(); + + for (auto _ : state) { + hashFunctionsBench.do_bench(state, num_rows, test_default_hash); + } +} + +BENCHMARK(BM_HashFunctions_Eval)->Apply(BM_HashFunctions_Eval_Arg); + +} // namespace starrocks + +BENCHMARK_MAIN(); \ No newline at end of file diff --git a/be/src/exprs/hash_functions.h b/be/src/exprs/hash_functions.h index 2d71ad4e0d0a2..99d32ebd6cf2a 100644 --- a/be/src/exprs/hash_functions.h +++ b/be/src/exprs/hash_functions.h @@ -27,6 +27,12 @@ class HashFunctions { * @return IntColumn */ DEFINE_VECTORIZED_FN(murmur_hash3_32); + + /** + * @param columns: [BinaryColumn, ...] + * @return BigIntColumn + */ + DEFINE_VECTORIZED_FN(xx_hash3_64); }; inline StatusOr HashFunctions::murmur_hash3_32(FunctionContext* context, const starrocks::Columns& columns) { @@ -58,4 +64,43 @@ inline StatusOr HashFunctions::murmur_hash3_32(FunctionContext* conte return builder.build(ColumnHelper::is_all_const(columns)); } +inline StatusOr HashFunctions::xx_hash3_64(FunctionContext* context, const starrocks::Columns& columns) { + std::vector> column_viewers; + + column_viewers.reserve(columns.size()); + for (const auto& column : columns) { + column_viewers.emplace_back(column); + } + + const uint64_t default_xxhash_seed = HashUtil::XXHASH3_64_SEED; + + size_t row_size = columns[0]->size(); + std::vector seeds_vec(row_size, default_xxhash_seed); + std::vector is_null_vec(row_size, false); + + for (const auto& viewer : column_viewers) { + for (size_t row = 0; row < row_size; ++row) { + if (is_null_vec[row]) { + continue; + } + + if (viewer.is_null(row)) { + is_null_vec[row] = true; + continue; + } + + auto slice = viewer.value(row); + uint64_t seed = seeds_vec[row]; + seeds_vec[row] = HashUtil::xx_hash3_64(slice.data, slice.size, seed); + } + } + + ColumnBuilder builder(row_size); + for (int row = 0; row < row_size; ++row) { + builder.append(seeds_vec[row], is_null_vec[row]); + } + + return builder.build(ColumnHelper::is_all_const(columns)); +} + } // namespace starrocks diff --git a/be/src/util/CMakeLists.txt b/be/src/util/CMakeLists.txt index 9b3c70866a433..784a83f964176 100644 --- a/be/src/util/CMakeLists.txt +++ b/be/src/util/CMakeLists.txt @@ -101,6 +101,7 @@ set(UTIL_FILES failpoint/fail_point.cpp bthreads/future.h bthreads/future_impl.cpp + hash_util.cpp ) add_library(Util STATIC diff --git a/be/src/util/hash_util.cpp b/be/src/util/hash_util.cpp new file mode 100644 index 0000000000000..90e622dd372ce --- /dev/null +++ b/be/src/util/hash_util.cpp @@ -0,0 +1,25 @@ +// Copyright 2021-present StarRocks, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "util/hash_util.hpp" + +#include "util/xxh3.h" + +namespace starrocks { + +uint64_t HashUtil::xx_hash3_64(const void* key, int32_t len, uint64_t seed) { + return XXH3_64bits_withSeed(key, len, seed); +} + +} // namespace starrocks \ No newline at end of file diff --git a/be/src/util/hash_util.hpp b/be/src/util/hash_util.hpp index f29fd101acfb7..3113fe76dd735 100644 --- a/be/src/util/hash_util.hpp +++ b/be/src/util/hash_util.hpp @@ -131,6 +131,7 @@ class HashUtil { // refer to https://github.com/apache/commons-codec/blob/master/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java static const uint32_t MURMUR3_32_SEED = 104729; + static const uint64_t XXHASH3_64_SEED = 0; ALWAYS_INLINE static uint32_t rotl32(uint32_t x, int8_t r) { return (x << r) | (x >> (32 - r)); } @@ -186,6 +187,8 @@ class HashUtil { return h1; } + static uint64_t xx_hash3_64(const void* key, int32_t len, uint64_t seed); + // default values recommended by http://isthe.com/chongo/tech/comp/fnv/ static const uint32_t FNV_PRIME = 0x01000193; // 16777619 static constexpr uint32_t FNV_SEED = 0x811C9DC5; // 2166136261 diff --git a/be/test/exprs/hash_functions_test.cpp b/be/test/exprs/hash_functions_test.cpp index e05dc855551f8..741b09e6ea86c 100644 --- a/be/test/exprs/hash_functions_test.cpp +++ b/be/test/exprs/hash_functions_test.cpp @@ -81,6 +81,64 @@ TEST_F(HashFunctionsTest, hashTest) { } } +TEST_F(HashFunctionsTest, test_xx_hash3_64) { + { + Columns columns; + auto tc1 = BinaryColumn::create(); + tc1->append("hello"); + tc1->append("starrocks"); + columns.emplace_back(tc1); + + std::unique_ptr ctx(FunctionContext::create_test_context()); + ColumnPtr result = HashFunctions::xx_hash3_64(ctx.get(), columns).value(); + + auto v = ColumnHelper::cast_to(result); + ASSERT_EQ(-7685981735718036227, v->get_data()[0]); + ASSERT_EQ(6573472450560322992, v->get_data()[1]); + } + + { + Columns columns; + auto tc1 = BinaryColumn::create(); + tc1->append("hello"); + tc1->append("hello"); + + auto tc2 = BinaryColumn::create(); + tc2->append("world"); + tc2->append("starrocks"); + + columns.emplace_back(tc1); + columns.emplace_back(tc2); + + std::unique_ptr ctx(FunctionContext::create_test_context()); + ColumnPtr result = HashFunctions::xx_hash3_64(ctx.get(), columns).value(); + + auto v = ColumnHelper::cast_to(result); + ASSERT_EQ(7001965798170371843, v->get_data()[0]); + ASSERT_EQ(2803320466222626098, v->get_data()[1]); + } + + { + Columns columns; + auto tc1 = BinaryColumn::create(); + tc1->append("hello"); + + auto tc2 = ColumnHelper::create_const_null_column(1); + + auto tc3 = BinaryColumn::create(); + tc3->append("world"); + + columns.emplace_back(tc1); + columns.emplace_back(tc2); + columns.emplace_back(tc3); + + std::unique_ptr ctx(FunctionContext::create_test_context()); + ColumnPtr result = HashFunctions::xx_hash3_64(ctx.get(), columns).value(); + + ASSERT_TRUE(result->is_null(0)); + } +} + TEST_F(HashFunctionsTest, emptyTest) { uint32_t h3 = 123456; diff --git a/docs/sql-reference/sql-functions/hash-functions/xx_hash3_64.md b/docs/sql-reference/sql-functions/hash-functions/xx_hash3_64.md new file mode 100644 index 0000000000000..48d892047ea16 --- /dev/null +++ b/docs/sql-reference/sql-functions/hash-functions/xx_hash3_64.md @@ -0,0 +1,40 @@ +# xx_hash3_64 + +## Description + +Returns the 64-bit xxhash3 hash value of the input string. + +## Syntax + +```Haskell +BIGINT XX_HASH3_64(VARCHAR input, ...) +``` + +## Examples + +```Plain Text +MySQL > select xx_hash3_64(null); ++-------------------+ +| xx_hash3_64(NULL) | ++-------------------+ +| NULL | ++-------------------+ + +MySQL > select xx_hash3_64("hello"); ++----------------------+ +| xx_hash3_64('hello') | ++----------------------+ +| -7685981735718036227 | ++----------------------+ + +MySQL > select xx_hash3_64("hello", "world"); ++-------------------------------+ +| xx_hash3_64('hello', 'world') | ++-------------------------------+ +| 7001965798170371843 | ++-------------------------------+ +``` + +## keyword + +XX_HASH3_64,HASH diff --git a/gensrc/script/functions.py b/gensrc/script/functions.py index b93152d659945..0a30d8f503c3b 100644 --- a/gensrc/script/functions.py +++ b/gensrc/script/functions.py @@ -618,6 +618,7 @@ # hash function [100010, 'murmur_hash3_32', 'INT', ['VARCHAR', '...'], 'HashFunctions::murmur_hash3_32'], + [100021, 'xx_hash3_64', 'BIGINT', ['VARCHAR', '...'], 'HashFunctions::xx_hash3_64'], # Utility functions [100011, 'sleep', 'BOOLEAN', ['INT'], "UtilityFunctions::sleep"],