Skip to content

Commit

Permalink
[Feature](NGram BloomFilter Index) add new ngram bloom filter index t…
Browse files Browse the repository at this point in the history
…o speed up like query (apache#11579)

This PR implements a new bloom filter index, the NGram bloom filter index, which was proposed in apache#10733.
The new index can greatly improve LIKE query performance; in some of our test cases it yields an order-of-magnitude speedup.
For how to use it, see the docs in this PR. The index relies on ```enable_function_pushdown```:
you need to set it to ```true``` for the index to take effect on LIKE queries.
  • Loading branch information
compasses authored Dec 28, 2022
1 parent 0f8b15b commit 75aa00d
Show file tree
Hide file tree
Showing 44 changed files with 1,720 additions and 27 deletions.
4 changes: 4 additions & 0 deletions .clang-format-ignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,7 @@ be/src/util/sse2neon.h
be/src/util/mustache/mustache.h
be/src/util/mustache/mustache.cc
be/src/util/utf8_check.cpp
be/src/util/cityhash102/city.h
be/src/util/cityhash102/city.cc
be/src/util/cityhash102/citycrc.h
be/src/util/cityhash102/config.h
1 change: 1 addition & 0 deletions .licenserc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ header:
- "be/src/util/sse2neo.h"
- "be/src/util/sse2neon.h"
- "be/src/util/utf8_check.cpp"
- "be/src/util/cityhash102"
- "build-support/run_clang_format.py"
- "regression-test/data"
- "docs/.vuepress/public/css/animate.min.css"
Expand Down
1 change: 1 addition & 0 deletions be/src/common/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -826,6 +826,7 @@ CONF_Int32(s3_transfer_executor_pool_size, "2");
CONF_Bool(enable_time_lut, "true");
CONF_Bool(enable_simdjson_reader, "false");

// When true, a pushed-down LIKE predicate on a column that has an NGram bloom
// filter index gets a bloom filter built from its pattern
// (see TabletReader::_init_conditions_param).
CONF_mBool(enable_query_like_bloom_filter, "true");
// number of s3 scanner thread pool size
CONF_Int32(doris_remote_scanner_thread_pool_thread_num, "16");
// number of s3 scanner thread pool queue size
Expand Down
2 changes: 2 additions & 0 deletions be/src/olap/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ add_library(Olap STATIC
file_helper.cpp
hll.cpp
inverted_index_parser.cpp
itoken_extractor.cpp
like_column_predicate.cpp
key_coder.cpp
lru_cache.cpp
Expand Down Expand Up @@ -93,6 +94,7 @@ add_library(Olap STATIC
rowset/segment_v2/empty_segment_iterator.cpp
rowset/segment_v2/segment_writer.cpp
rowset/segment_v2/block_split_bloom_filter.cpp
rowset/segment_v2/ngram_bloom_filter.cpp
rowset/segment_v2/bloom_filter_index_reader.cpp
rowset/segment_v2/bloom_filter_index_writer.cpp
rowset/segment_v2/bloom_filter.cpp
Expand Down
9 changes: 9 additions & 0 deletions be/src/olap/column_predicate.h
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,15 @@ class ColumnPredicate {
bool* flags) const {
DCHECK(false) << "should not reach here";
}

// Returns the search string carried by this predicate.
// The base version is a placeholder: it trips a DCHECK and falls back to an
// empty string, so only predicates that override it should ever be asked.
virtual std::string get_search_str() const {
    DCHECK(false) << "should not reach here";
    return std::string();
}

// Hands an NGram bloom filter to the predicate (ownership is transferred).
// The base version does not accept one: it trips a DCHECK and the filter is
// released when the argument goes out of scope.
virtual void set_page_ng_bf(std::unique_ptr<segment_v2::BloomFilter> /*ng_bf*/) {
    DCHECK(false) << "should not reach here";
}
// Id of the column this predicate is evaluated against.
uint32_t column_id() const {
    return _column_id;
}

virtual std::string debug_string() const {
Expand Down
77 changes: 77 additions & 0 deletions be/src/olap/itoken_extractor.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "itoken_extractor.h"

#include "util/simd/vstring_function.h"

namespace doris {

/// Locates the n-gram that starts at byte offset *pos, without copying it.
///
/// Code point widths come from get_utf8_byte_length() applied to each lead byte.
/// A multi-byte character whose declared width runs past the end of the buffer
/// (truncated input) is clamped to the bytes that are actually present, so the
/// reported token never extends beyond `length`.
///
/// @param data          string bytes
/// @param length        number of bytes in data
/// @param pos           in/out scan position; advanced by the width of the character
///                      at the old position whenever that position is inside the buffer
/// @param token_start   out: byte offset of the token (the old *pos)
/// @param token_length  out: byte length of the token
/// @return true when the token holds exactly n code points, false when fewer than n
///         code points remain (including when *pos is already at or past the end).
bool NgramTokenExtractor::next_in_string(const char* data, size_t length, size_t* __restrict pos,
                                         size_t* __restrict token_start,
                                         size_t* __restrict token_length) const {
    *token_start = *pos;
    *token_length = 0;

    // Nothing left to scan; this also keeps the data[*pos] read below in bounds.
    if (*pos >= length) {
        return false;
    }

    size_t code_points = 0;
    for (; code_points < n && *token_start + *token_length < length; ++code_points) {
        const size_t offset = *token_start + *token_length;
        const size_t declared = get_utf8_byte_length(static_cast<uint8_t>(data[offset]));
        const size_t remaining = length - offset;
        // Clamp so a truncated trailing sequence cannot push the token past the buffer.
        *token_length += declared < remaining ? declared : remaining;
    }

    // Slide the window by one character so the next call yields the following n-gram.
    *pos += get_utf8_byte_length(static_cast<uint8_t>(data[*pos]));
    return code_points == n;
}

/// Extracts the next n-gram from a LIKE pattern, scanning from byte offset *pos.
///
/// Pattern handling:
///  - an unescaped `%` or `_` is a wildcard: the partially built token is discarded
///    and both the scan and *pos restart right after the wildcard;
///  - `\` escapes the next character: `\%`, `\_` and `\\` contribute the literal
///    character, while a backslash in front of any other character is dropped;
///  - every other character is copied as one code point whose byte width comes from
///    get_utf8_byte_length(), clamped to the bytes left in the buffer so a truncated
///    trailing sequence is never read past `length`.
///
/// @param data    pattern bytes
/// @param length  number of bytes in data
/// @param pos     in/out scan position; on success advanced by the width of the
///                character found at *pos
/// @param token   out: cleared on entry; holds the n-gram when true is returned
/// @return true when a token of exactly n code points was produced, false when the
///         end of the pattern was reached first.
bool NgramTokenExtractor::next_in_string_like(const char* data, size_t length, size_t* pos,
                                              std::string& token) const {
    token.clear();

    size_t code_points = 0;
    bool escaped = false;
    for (size_t i = *pos; i < length;) {
        if (escaped && (data[i] == '%' || data[i] == '_' || data[i] == '\\')) {
            // Escaped special character: take it literally.
            token += data[i];
            ++code_points;
            escaped = false;
            ++i;
        } else if (!escaped && (data[i] == '%' || data[i] == '_')) {
            /// This token is too small, go to the next.
            token.clear();
            code_points = 0;
            escaped = false;
            *pos = ++i;
        } else if (!escaped && data[i] == '\\') {
            escaped = true;
            ++i;
        } else {
            const size_t declared = get_utf8_byte_length(static_cast<uint8_t>(data[i]));
            const size_t remaining = length - i;
            // Never copy bytes that lie beyond the end of the pattern.
            const size_t sz = declared < remaining ? declared : remaining;
            token.append(data + i, sz);
            i += sz;
            ++code_points;
            escaped = false;
        }

        if (code_points == n) {
            // Slide the window by one character for the next call.
            *pos += get_utf8_byte_length(static_cast<uint8_t>(data[*pos]));
            return true;
        }
    }

    return false;
}
} // namespace doris
98 changes: 98 additions & 0 deletions be/src/olap/itoken_extractor.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#ifndef DORIS_ITOKEN_EXTRACTOR_H
#define DORIS_ITOKEN_EXTRACTOR_H

#include <stddef.h>

#include <string>

#include "olap/rowset/segment_v2/bloom_filter.h"

namespace doris {

/// Interface for string parsers that cut a string into tokens and can feed
/// those tokens into a segment_v2::BloomFilter.
struct ITokenExtractor {
virtual ~ITokenExtractor() = default;

/// Fast inplace implementation for regular use.
/// Gets string (data ptr and len) and start position for extracting next token (state of extractor).
/// The token is reported as an offset/length pair into `data`
/// (token_start, token_length); nothing is copied.
/// Returns false if parsing is finished, otherwise returns true.
virtual bool next_in_string(const char* data, size_t length, size_t* __restrict pos,
size_t* __restrict token_start,
size_t* __restrict token_length) const = 0;

/// Special implementation for creating bloom filter for LIKE function.
/// It skips unescaped `%` and `_` and supports escaping symbols, but it is less lightweight.
/// The token is returned by value through `out`; `pos` is the in/out scan position.
virtual bool next_in_string_like(const char* data, size_t length, size_t* pos,
std::string& out) const = 0;

/// Feeds the tokens of the string [data, data + length) into bloom_filter.
/// ITokenExtractorHelper implements this on top of next_in_string().
virtual void string_to_bloom_filter(const char* data, size_t length,
segment_v2::BloomFilter& bloom_filter) const = 0;

/// Feeds the tokens of a LIKE pattern into bloom_filter.
/// ITokenExtractorHelper implements this on top of next_in_string_like() and
/// returns whether at least one token was added.
virtual bool string_like_to_bloom_filter(const char* data, size_t length,
segment_v2::BloomFilter& bloom_filter) const = 0;
};

/// CRTP helper that implements the bloom-filter feeding entry points of
/// ITokenExtractor by repeatedly calling the Derived tokenizer.
template <typename Derived>
class ITokenExtractorHelper : public ITokenExtractor {
public:
    /// Adds every token that Derived::next_in_string() reports for
    /// [data, data + length) to bloom_filter. Stops at the end of the string or
    /// as soon as the tokenizer returns false.
    void string_to_bloom_filter(const char* data, size_t length,
                                segment_v2::BloomFilter& bloom_filter) const override {
        const Derived& self = static_cast<const Derived&>(*this);
        size_t offset = 0;
        size_t begin = 0;
        size_t size = 0;

        for (;;) {
            if (offset >= length) {
                break;
            }
            if (!self.next_in_string(data, length, &offset, &begin, &size)) {
                break;
            }
            bloom_filter.add_bytes(data + begin, size);
        }
    }

    /// Adds every token that Derived::next_in_string_like() extracts from the
    /// LIKE pattern to bloom_filter.
    /// @return true if at least one token was added.
    bool string_like_to_bloom_filter(const char* data, size_t length,
                                     segment_v2::BloomFilter& bloom_filter) const override {
        const Derived& self = static_cast<const Derived&>(*this);
        std::string piece;
        size_t offset = 0;
        bool any_added = false;

        for (;;) {
            if (offset >= length) {
                break;
            }
            if (!self.next_in_string_like(data, length, &offset, piece)) {
                break;
            }
            bloom_filter.add_bytes(piece.data(), piece.size());
            any_added = true;
        }

        return any_added;
    }
};

/// Token extractor that yields all n-grams of a string, where n is the number of
/// code points per token and is fixed at construction.
struct NgramTokenExtractor final : public ITokenExtractorHelper<NgramTokenExtractor> {
public:
    /// @param n_ number of code points in each extracted token
    explicit NgramTokenExtractor(size_t n_) : n {n_} {}

    /// Reports the n-gram starting at *pos as an offset/length pair into data.
    bool next_in_string(const char* data, size_t length, size_t* __restrict pos,
                        size_t* __restrict token_start,
                        size_t* __restrict token_length) const override;

    /// Copies the next n-gram of a LIKE pattern into token.
    bool next_in_string_like(const char* data, size_t length, size_t* pos,
                             std::string& token) const override;

private:
    // Token size, counted in code points.
    size_t n;
};
} // namespace doris

#endif //DORIS_ITOKEN_EXTRACTOR_H
24 changes: 21 additions & 3 deletions be/src/olap/like_column_predicate.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,22 @@ class LikeColumnPredicate : public ColumnPredicate {
void evaluate_and_vec(const vectorized::IColumn& column, uint16_t size,
bool* flags) const override;

// Returns a copy of the pattern bytes (pattern.ptr, pattern.len) as a std::string.
std::string get_search_str() const override {
    char* const raw = reinterpret_cast<char*>(pattern.ptr);
    return std::string(raw, pattern.len);
}
// Exposes the _opposite flag of this predicate.
bool is_opposite() const {
    return _opposite;
}

// Takes ownership of the NGram bloom filter built for this predicate and keeps
// it in _page_ng_bf, replacing any filter stored before.
void set_page_ng_bf(std::unique_ptr<segment_v2::BloomFilter> src) override {
    _page_ng_bf.swap(src);
    src.reset();
}
bool evaluate_and(const BloomFilter* bf) const override {
if (_page_ng_bf) {
return bf->contains(*_page_ng_bf);
}
return true;
}
// This predicate always reports that it can be evaluated against a bloom filter.
bool can_do_bloom_filter() const override {
    return true;
}

private:
template <bool is_and>
void _evaluate_vec(const vectorized::IColumn& column, uint16_t size, bool* flags) const {
Expand Down Expand Up @@ -130,9 +146,11 @@ class LikeColumnPredicate : public ColumnPredicate {

StateType* _state;

// A separate scratch region is required for every concurrent caller of the Hyperscan API.
// So here _like_state is separate for each instance of LikeColumnPredicate.
// A separate scratch region is required for every concurrent caller of the
// Hyperscan API. So here _like_state is separate for each instance of
// LikeColumnPredicate.
vectorized::LikeSearchState _like_state;
std::unique_ptr<segment_v2::BloomFilter> _page_ng_bf; // for ngram-bf index
};

} //namespace doris
} // namespace doris
33 changes: 33 additions & 0 deletions be/src/olap/reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@
#include "common/status.h"
#include "exprs/create_predicate_function.h"
#include "exprs/hybrid_set.h"
#include "gen_cpp/segment_v2.pb.h"
#include "olap/bloom_filter_predicate.h"
#include "olap/comparison_predicate.h"
#include "olap/in_list_predicate.h"
#include "olap/itoken_extractor.h"
#include "olap/like_column_predicate.h"
#include "olap/olap_common.h"
#include "olap/predicate_creator.h"
Expand Down Expand Up @@ -463,8 +468,36 @@ void TabletReader::_init_conditions_param(const ReaderParams& read_params) {
}

// Function filter push down to storage engine
auto is_like_predicate = [](ColumnPredicate* _pred) {
if (dynamic_cast<LikeColumnPredicate<false>*>(_pred) ||
dynamic_cast<LikeColumnPredicate<true>*>(_pred)) {
return true;
}

return false;
};

// Turn every pushed-down function filter into a column predicate. For a LIKE
// predicate on a column that has an NGram bloom filter index (and when
// enable_query_like_bloom_filter is on), also build a bloom filter from the
// n-grams of the pattern and attach it to the predicate.
for (const auto& filter : read_params.function_filters) {
    _col_predicates.emplace_back(_parse_to_predicate(filter));
    auto* pred = _col_predicates.back();
    const auto& col = _tablet->tablet_schema()->column(pred->column_id());
    auto is_like = is_like_predicate(pred);
    auto* tablet_index = _tablet->tablet_schema()->get_ngram_bf_index(col.unique_id());

    if (is_like && tablet_index && config::enable_query_like_bloom_filter) {
        std::unique_ptr<segment_v2::BloomFilter> ng_bf;
        std::string pattern = pred->get_search_str();
        auto gram_bf_size = tablet_index->get_gram_bf_size();
        auto gram_size = tablet_index->get_gram_size();

        segment_v2::BloomFilter::create(segment_v2::NGRAM_BLOOM_FILTER, &ng_bf, gram_bf_size);
        // create() leaves ng_bf empty when it fails; the predicate then simply
        // runs without a pattern filter instead of dereferencing a null pointer.
        if (!ng_bf) {
            continue;
        }

        NgramTokenExtractor token_extractor(gram_size);
        // Attach the filter only if the pattern produced at least one n-gram.
        if (token_extractor.string_like_to_bloom_filter(pattern.data(), pattern.length(),
                                                        *ng_bf)) {
            pred->set_page_ng_bf(std::move(ng_bf));
        }
    }
}
}

Expand Down
1 change: 1 addition & 0 deletions be/src/olap/rowset/segment_v2/block_split_bloom_filter.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class BlockSplitBloomFilter : public BloomFilter {
void add_hash(uint64_t hash) override;

bool test_hash(uint64_t hash) const override;
// Filter-in-filter containment is not evaluated by this filter type: the
// argument is ignored and true is always returned.
bool contains(const BloomFilter& /*other*/) const override {
    return true;
}

private:
// Bytes in a tiny Bloom filter block.
Expand Down
6 changes: 5 additions & 1 deletion be/src/olap/rowset/segment_v2/bloom_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,18 @@
#include "gen_cpp/segment_v2.pb.h"
#include "gutil/strings/substitute.h"
#include "olap/rowset/segment_v2/block_split_bloom_filter.h"
#include "olap/rowset/segment_v2/ngram_bloom_filter.h"
#include "olap/utils.h"

namespace doris {
namespace segment_v2 {

Status BloomFilter::create(BloomFilterAlgorithmPB algorithm, std::unique_ptr<BloomFilter>* bf) {
Status BloomFilter::create(BloomFilterAlgorithmPB algorithm, std::unique_ptr<BloomFilter>* bf,
size_t bf_size) {
if (algorithm == BLOCK_BLOOM_FILTER) {
bf->reset(new BlockSplitBloomFilter());
} else if (algorithm == NGRAM_BLOOM_FILTER) {
bf->reset(new NGramBloomFilter(bf_size));
} else {
return Status::InternalError("invalid bloom filter algorithm:{}", algorithm);
}
Expand Down
Loading

0 comments on commit 75aa00d

Please sign in to comment.