[Feature](NGram BloomFilter Index) add new ngram bloom filter index t…

…o speed up like query (apache#11579) This PR implements a new bloom filter index, the NGram bloom filter index, which was proposed in apache#10733. The new index can greatly improve LIKE query performance; in some of our test cases it yields an order-of-magnitude improvement. For usage, see the docs in this PR. The index depends on ```enable_function_pushdown```: you need to set it to ```true``` for the index to take effect on LIKE queries.
morningman · Dec 28, 2022 · 75aa00d · 75aa00d
1 parent 0f8b15b
commit 75aa00d
Show file tree

Hide file tree

Showing 44 changed files with 1,720 additions and 27 deletions.
diff --git a/.clang-format-ignore b/.clang-format-ignore
@@ -7,3 +7,7 @@ be/src/util/sse2neon.h
 be/src/util/mustache/mustache.h
 be/src/util/mustache/mustache.cc
 be/src/util/utf8_check.cpp
+be/src/util/cityhash102/city.h
+be/src/util/cityhash102/city.cc
+be/src/util/cityhash102/citycrc.h
+be/src/util/cityhash102/config.h
diff --git a/.licenserc.yaml b/.licenserc.yaml
@@ -63,6 +63,7 @@ header:
     - "be/src/util/sse2neo.h"
     - "be/src/util/sse2neon.h"
     - "be/src/util/utf8_check.cpp"
+    - "be/src/util/cityhash102"
     - "build-support/run_clang_format.py"
     - "regression-test/data"
     - "docs/.vuepress/public/css/animate.min.css"

diff --git a/be/src/common/config.h b/be/src/common/config.h
@@ -826,6 +826,7 @@ CONF_Int32(s3_transfer_executor_pool_size, "2");
 CONF_Bool(enable_time_lut, "true");
 CONF_Bool(enable_simdjson_reader, "false");
 
+CONF_mBool(enable_query_like_bloom_filter, "true");
 // number of s3 scanner thread pool size
 CONF_Int32(doris_remote_scanner_thread_pool_thread_num, "16");
 // number of s3 scanner thread pool queue size

diff --git a/be/src/olap/CMakeLists.txt b/be/src/olap/CMakeLists.txt
@@ -40,6 +40,7 @@ add_library(Olap STATIC
     file_helper.cpp
     hll.cpp
     inverted_index_parser.cpp
+    itoken_extractor.cpp
     like_column_predicate.cpp
     key_coder.cpp
     lru_cache.cpp
@@ -93,6 +94,7 @@ add_library(Olap STATIC
     rowset/segment_v2/empty_segment_iterator.cpp
     rowset/segment_v2/segment_writer.cpp
     rowset/segment_v2/block_split_bloom_filter.cpp
+    rowset/segment_v2/ngram_bloom_filter.cpp
     rowset/segment_v2/bloom_filter_index_reader.cpp
     rowset/segment_v2/bloom_filter_index_writer.cpp
     rowset/segment_v2/bloom_filter.cpp

diff --git a/be/src/olap/column_predicate.h b/be/src/olap/column_predicate.h
@@ -156,6 +156,15 @@ class ColumnPredicate {
                                   bool* flags) const {
         DCHECK(false) << "should not reach here";
     }
+
+    // Returns the search string of the predicate. This default implementation is
+    // a placeholder for predicates that have no search string: it trips a DCHECK
+    // and returns an empty string. Subclasses that carry one override it.
+    virtual std::string get_search_str() const {
+        DCHECK(false) << "should not reach here";
+        return "";
+    }
+
+    // Hands a bloom filter over to the predicate (ownership is transferred via
+    // unique_ptr). This default implementation is a placeholder that trips a
+    // DCHECK and drops the argument; subclasses that use the filter override it.
+    virtual void set_page_ng_bf(std::unique_ptr<segment_v2::BloomFilter>) {
+        DCHECK(false) << "should not reach here";
+    }
     uint32_t column_id() const { return _column_id; }
 
     virtual std::string debug_string() const {

diff --git a/be/src/olap/itoken_extractor.cpp b/be/src/olap/itoken_extractor.cpp
@@ -0,0 +1,77 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "itoken_extractor.h"
+
+#include "util/simd/vstring_function.h"
+
+namespace doris {
+
+/// Extracts the n-gram that starts at byte offset `*pos` of `data`.
+///
+/// On return `*token_start` / `*token_length` describe the token as a byte range
+/// inside `data`, and `*pos` has been advanced by one character so the next call
+/// yields the n-gram starting at the following character.
+///
+/// The caller must guarantee `*pos < length`: `data[*pos]` is read unconditionally.
+///
+/// Returns true when a full token of `n` characters was produced, false when the
+/// end of the string was reached first (the partial range is still reported).
+bool NgramTokenExtractor::next_in_string(const char* data, size_t length, size_t* __restrict pos,
+                                         size_t* __restrict token_start,
+                                         size_t* __restrict token_length) const {
+    *token_start = *pos;
+    *token_length = 0;
+    size_t code_points = 0;
+    for (; code_points < n && *token_start + *token_length < length; ++code_points) {
+        const size_t offset = *token_start + *token_length;
+        size_t sz = get_utf8_byte_length(static_cast<uint8_t>(data[offset]));
+        // A truncated or malformed trailing sequence can announce more bytes than
+        // remain in the buffer; clamp so the token never extends past `length`.
+        if (sz > length - offset) {
+            sz = length - offset;
+        }
+        *token_length += sz;
+    }
+    // Slide the window by exactly one character, not by the whole token.
+    *pos += get_utf8_byte_length(static_cast<uint8_t>(data[*pos]));
+    return code_points == n;
+}
+
+/// Extracts the next n-gram from the LIKE pattern `data`, scanning from byte
+/// offset `*pos`.
+///
+/// Unescaped `%` and `_` are wildcards: they end the token being built and the
+/// scan restarts right after them. A backslash escapes the following `%`, `_` or
+/// `\`, which is then taken literally. The token is copied into `token` (the
+/// escape characters themselves are not part of it).
+///
+/// Returns true when `token` holds `n` characters; `*pos` is then advanced by one
+/// character of the pattern so the next call produces the following n-gram.
+/// Returns false when the pattern is exhausted before a full token is collected.
+bool NgramTokenExtractor::next_in_string_like(const char* data, size_t length, size_t* pos,
+                                              std::string& token) const {
+    token.clear();
+
+    size_t code_points = 0;
+    bool escaped = false;
+    for (size_t i = *pos; i < length;) {
+        if (escaped && (data[i] == '%' || data[i] == '_' || data[i] == '\\')) {
+            // Escaped metacharacter: it is an ordinary character of the token.
+            token += data[i];
+            ++code_points;
+            escaped = false;
+            ++i;
+        } else if (!escaped && (data[i] == '%' || data[i] == '_')) {
+            /// This token is too small, go to the next.
+            token.clear();
+            code_points = 0;
+            escaped = false;
+            *pos = ++i;
+        } else if (!escaped && data[i] == '\\') {
+            escaped = true;
+            ++i;
+        } else {
+            size_t sz = get_utf8_byte_length(static_cast<uint8_t>(data[i]));
+            // A truncated or malformed trailing sequence can announce more bytes
+            // than remain in the pattern; clamp so we never read past `length`.
+            if (sz > length - i) {
+                sz = length - i;
+            }
+            token.append(data + i, sz);
+            i += sz;
+            ++code_points;
+            escaped = false;
+        }
+
+        if (code_points == n) {
+            // Slide the window by one character of the pattern.
+            *pos += get_utf8_byte_length(static_cast<uint8_t>(data[*pos]));
+            return true;
+        }
+    }
+
+    return false;
+}
+} // namespace doris
diff --git a/be/src/olap/itoken_extractor.h b/be/src/olap/itoken_extractor.h
@@ -0,0 +1,98 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef DORIS_ITOKEN_EXTRACTOR_H
+#define DORIS_ITOKEN_EXTRACTOR_H
+
+#include <stddef.h>
+
+#include <string>
+
+#include "olap/rowset/segment_v2/bloom_filter.h"
+
+namespace doris {
+
+/// Interface for string parsers.
+struct ITokenExtractor {
+    virtual ~ITokenExtractor() = default;
+
+    /// Fast inplace implementation for regular use.
+    /// Gets string (data ptr and len) and start position for extracting next token (state of extractor).
+    /// Returns false if parsing is finished, otherwise returns true.
+    virtual bool next_in_string(const char* data, size_t length, size_t* __restrict pos,
+                                size_t* __restrict token_start,
+                                size_t* __restrict token_length) const = 0;
+
+    /// Special implementation for creating bloom filter for LIKE function.
+    /// It skips unescaped `%` and `_` and supports escaping symbols, but it is less lightweight.
+    virtual bool next_in_string_like(const char* data, size_t length, size_t* pos,
+                                     std::string& out) const = 0;
+
+    /// Adds the tokens of the plain string `data` (`length` bytes) to `bloom_filter`.
+    /// ITokenExtractorHelper implements this on top of next_in_string().
+    virtual void string_to_bloom_filter(const char* data, size_t length,
+                                        segment_v2::BloomFilter& bloom_filter) const = 0;
+
+    /// Adds the tokens of the LIKE pattern `data` (`length` bytes) to `bloom_filter`.
+    /// ITokenExtractorHelper implements this on top of next_in_string_like() and
+    /// returns true only if at least one token was added.
+    virtual bool string_like_to_bloom_filter(const char* data, size_t length,
+                                             segment_v2::BloomFilter& bloom_filter) const = 0;
+};
+
+/// CRTP base that implements the bloom-filter filling routines once, on top of
+/// the token-level primitives supplied by `Derived`.
+template <typename Derived>
+class ITokenExtractorHelper : public ITokenExtractor {
+public:
+    /// Adds every token that `Derived::next_in_string` yields for `data` to
+    /// `bloom_filter`; stops at the end of the string or at the first failure.
+    void string_to_bloom_filter(const char* data, size_t length,
+                                segment_v2::BloomFilter& bloom_filter) const override {
+        const auto* self = static_cast<const Derived*>(this);
+        size_t offset = 0;
+        size_t start = 0;
+        size_t len = 0;
+
+        while (offset < length) {
+            if (!self->next_in_string(data, length, &offset, &start, &len)) {
+                break;
+            }
+            bloom_filter.add_bytes(data + start, len);
+        }
+    }
+
+    /// Adds every token that `Derived::next_in_string_like` yields for the LIKE
+    /// pattern `data` to `bloom_filter`.
+    /// Returns true if at least one token was added.
+    bool string_like_to_bloom_filter(const char* data, size_t length,
+                                     segment_v2::BloomFilter& bloom_filter) const override {
+        const auto* self = static_cast<const Derived*>(this);
+        std::string ngram;
+        size_t offset = 0;
+        bool inserted = false;
+
+        while (offset < length) {
+            if (!self->next_in_string_like(data, length, &offset, ngram)) {
+                break;
+            }
+            bloom_filter.add_bytes(ngram.data(), ngram.size());
+            inserted = true;
+        }
+
+        return inserted;
+    }
+};
+
+/// Parser extracting all ngrams from string.
+struct NgramTokenExtractor final : public ITokenExtractorHelper<NgramTokenExtractor> {
+public:
+    /// `n_` is the n-gram size. The implementation counts it in characters as
+    /// delimited by get_utf8_byte_length(), not in bytes.
+    explicit NgramTokenExtractor(size_t n_) : n(n_) {}
+
+    /// Yields the n-gram starting at `*pos` as a byte range inside `data` and
+    /// advances `*pos` by one character.
+    bool next_in_string(const char* data, size_t length, size_t* __restrict pos,
+                        size_t* __restrict token_start,
+                        size_t* __restrict token_length) const override;
+
+    /// LIKE-pattern variant: copies the next n-gram into `token`, honoring
+    /// unescaped `%` / `_` wildcards and backslash escapes.
+    bool next_in_string_like(const char* data, size_t length, size_t* pos,
+                             std::string& token) const override;
+
+private:
+    size_t n; // n-gram size, fixed at construction
+};
+} // namespace doris
+
+#endif //DORIS_ITOKEN_EXTRACTOR_H
diff --git a/be/src/olap/like_column_predicate.h b/be/src/olap/like_column_predicate.h
@@ -47,6 +47,22 @@ class LikeColumnPredicate : public ColumnPredicate {
     void evaluate_and_vec(const vectorized::IColumn& column, uint16_t size,
                           bool* flags) const override;
 
+    // Returns a copy of the `pattern` bytes (`pattern.len` bytes starting at
+    // `pattern.ptr`) as a std::string.
+    std::string get_search_str() const override {
+        return std::string(reinterpret_cast<char*>(pattern.ptr), pattern.len);
+    }
+    bool is_opposite() const { return _opposite; }
+
+    // Takes ownership of `src` and stores it in `_page_ng_bf`, the filter that
+    // evaluate_and(const BloomFilter*) later tests against.
+    void set_page_ng_bf(std::unique_ptr<segment_v2::BloomFilter> src) override {
+        _page_ng_bf = std::move(src);
+    }
+    // Bloom-filter pre-check: returns false only when `bf` provably cannot
+    // contain the n-grams stored in `_page_ng_bf`, so the caller may skip the
+    // data `bf` covers. Returns true ("cannot rule out") in every other case.
+    bool evaluate_and(const BloomFilter* bf) const override {
+        // When `_opposite` is set (presumably the negated, NOT LIKE form), data
+        // lacking the pattern's n-grams is exactly the data whose rows match,
+        // so the filter must never be used to prune it.
+        if (_opposite || !_page_ng_bf) {
+            return true;
+        }
+        return bf->contains(*_page_ng_bf);
+    }
+    bool can_do_bloom_filter() const override { return true; }
+
 private:
     template <bool is_and>
     void _evaluate_vec(const vectorized::IColumn& column, uint16_t size, bool* flags) const {
@@ -130,9 +146,11 @@ class LikeColumnPredicate : public ColumnPredicate {
 
     StateType* _state;
 
-    // A separate scratch region is required for every concurrent caller of the Hyperscan API.
-    // So here _like_state is separate for each instance of LikeColumnPredicate.
+    // A separate scratch region is required for every concurrent caller of the
+    // Hyperscan API. So here _like_state is separate for each instance of
+    // LikeColumnPredicate.
     vectorized::LikeSearchState _like_state;
+    std::unique_ptr<segment_v2::BloomFilter> _page_ng_bf; // for ngram-bf index
 };
 
-} //namespace doris
+} // namespace doris
diff --git a/be/src/olap/reader.cpp b/be/src/olap/reader.cpp
@@ -22,6 +22,11 @@
 #include "common/status.h"
 #include "exprs/create_predicate_function.h"
 #include "exprs/hybrid_set.h"
+#include "gen_cpp/segment_v2.pb.h"
+#include "olap/bloom_filter_predicate.h"
+#include "olap/comparison_predicate.h"
+#include "olap/in_list_predicate.h"
+#include "olap/itoken_extractor.h"
 #include "olap/like_column_predicate.h"
 #include "olap/olap_common.h"
 #include "olap/predicate_creator.h"
@@ -463,8 +468,36 @@ void TabletReader::_init_conditions_param(const ReaderParams& read_params) {
     }
 
     // Function filter push down to storage engine
+    // Recognizes LIKE predicates; both instantiations of the template are checked.
+    // dynamic_cast on a null pointer yields null, so a null predicate is reported
+    // as "not LIKE" instead of being dereferenced.
+    auto is_like_predicate = [](ColumnPredicate* _pred) {
+        return dynamic_cast<LikeColumnPredicate<false>*>(_pred) != nullptr ||
+               dynamic_cast<LikeColumnPredicate<true>*>(_pred) != nullptr;
+    };
+
     for (const auto& filter : read_params.function_filters) {
         _col_predicates.emplace_back(_parse_to_predicate(filter));
+        auto* pred = _col_predicates.back();
+
+        // The n-gram bloom filter only applies to LIKE predicates and only when
+        // the feature is switched on; skip the schema lookups otherwise.
+        if (!config::enable_query_like_bloom_filter || !is_like_predicate(pred)) {
+            continue;
+        }
+
+        const auto& col = _tablet->tablet_schema()->column(pred->column_id());
+        auto* tablet_index = _tablet->tablet_schema()->get_ngram_bf_index(col.unique_id());
+        if (tablet_index == nullptr) {
+            // The column has no n-gram bloom filter index.
+            continue;
+        }
+
+        std::unique_ptr<segment_v2::BloomFilter> ng_bf;
+        const Status st = segment_v2::BloomFilter::create(segment_v2::NGRAM_BLOOM_FILTER, &ng_bf,
+                                                          tablet_index->get_gram_bf_size());
+        if (!st.ok() || ng_bf == nullptr) {
+            // Creation failed: leave the predicate without a filter rather than
+            // dereferencing a null pointer below.
+            continue;
+        }
+
+        // Hash the n-grams of the LIKE pattern into the filter. Attach it only
+        // if the pattern produced at least one n-gram.
+        const std::string pattern = pred->get_search_str();
+        NgramTokenExtractor token_extractor(tablet_index->get_gram_size());
+        if (token_extractor.string_like_to_bloom_filter(pattern.data(), pattern.length(),
+                                                        *ng_bf)) {
+            pred->set_page_ng_bf(std::move(ng_bf));
+        }
     }
 }
 

diff --git a/be/src/olap/rowset/segment_v2/block_split_bloom_filter.h b/be/src/olap/rowset/segment_v2/block_split_bloom_filter.h
@@ -32,6 +32,7 @@ class BlockSplitBloomFilter : public BloomFilter {
     void add_hash(uint64_t hash) override;
 
     bool test_hash(uint64_t hash) const override;
+    // Always answers true: this filter type performs no filter-against-filter
+    // containment test, so it never rules anything out.
+    bool contains(const BloomFilter&) const override { return true; }
 
 private:
     // Bytes in a tiny Bloom filter block.

diff --git a/be/src/olap/rowset/segment_v2/bloom_filter.cpp b/be/src/olap/rowset/segment_v2/bloom_filter.cpp
@@ -21,14 +21,18 @@
 #include "gen_cpp/segment_v2.pb.h"
 #include "gutil/strings/substitute.h"
 #include "olap/rowset/segment_v2/block_split_bloom_filter.h"
+#include "olap/rowset/segment_v2/ngram_bloom_filter.h"
 #include "olap/utils.h"
 
 namespace doris {
 namespace segment_v2 {
 
-Status BloomFilter::create(BloomFilterAlgorithmPB algorithm, std::unique_ptr<BloomFilter>* bf) {
+Status BloomFilter::create(BloomFilterAlgorithmPB algorithm, std::unique_ptr<BloomFilter>* bf,
+                           size_t bf_size) {
     if (algorithm == BLOCK_BLOOM_FILTER) {
         bf->reset(new BlockSplitBloomFilter());
+    } else if (algorithm == NGRAM_BLOOM_FILTER) {
+        bf->reset(new NGramBloomFilter(bf_size));
     } else {
         return Status::InternalError("invalid bloom filter algorithm:{}", algorithm);
     }