Skip to content

Commit

Permalink
[Optimize](invert index) Optimize multiple terms conjunction query (a…
Browse files Browse the repository at this point in the history
  • Loading branch information
zzzxl1993 authored Sep 8, 2023
1 parent 0f408d1 commit 153c798
Show file tree
Hide file tree
Showing 15 changed files with 450 additions and 154 deletions.
2 changes: 1 addition & 1 deletion be/src/clucene
Submodule clucene updated 31 files
+7 −0 src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
+2 −0 src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.h
+3 −4 src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
+2 −0 src/core/CLucene/analysis/AnalysisHeader.h
+3 −0 src/core/CLucene/analysis/Analyzers.h
+37 −0 src/core/CLucene/analysis/CharFilter.h
+2 −0 src/core/CLucene/analysis/standard95/StandardAnalyzer.h
+7 −0 src/core/CLucene/document/Field.h
+2 −1 src/core/CLucene/index/CodeMode.h
+4 −3 src/core/CLucene/index/DocRange.h
+1 −9 src/core/CLucene/index/DocumentsWriter.cpp
+28 −3 src/core/CLucene/index/FieldInfos.cpp
+4 −0 src/core/CLucene/index/IndexReader.cpp
+3 −0 src/core/CLucene/index/IndexReader.h
+8 −0 src/core/CLucene/index/IndexVersion.h
+23 −44 src/core/CLucene/index/IndexWriter.cpp
+24 −2 src/core/CLucene/index/MultiSegmentReader.cpp
+20 −40 src/core/CLucene/index/SDocumentWriter.cpp
+1 −0 src/core/CLucene/index/SDocumentWriter.h
+5 −0 src/core/CLucene/index/SegmentReader.cpp
+155 −203 src/core/CLucene/index/SegmentTermDocs.cpp
+4 −0 src/core/CLucene/index/Terms.h
+8 −0 src/core/CLucene/index/_FieldInfos.h
+4 −0 src/core/CLucene/index/_MultiSegmentReader.h
+69 −0 src/core/CLucene/index/_SegmentHeader.h
+16 −0 src/core/CLucene/search/query/DcoIdSetIterator.h
+51 −0 src/core/CLucene/search/query/TermIterator.h
+6 −2 src/core/CLucene/util/CLStreams.h
+75 −0 src/core/CLucene/util/PFORUtil.cpp
+27 −0 src/core/CLucene/util/PFORUtil.h
+3 −0 src/core/CMakeLists.txt
6 changes: 4 additions & 2 deletions be/src/olap/rowset/segment_v2/column_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include "olap/column_predicate.h"
#include "olap/decimal12.h"
#include "olap/inverted_index_parser.h"
#include "olap/iterators.h"
#include "olap/olap_common.h"
#include "olap/rowset/segment_v2/binary_dict_page.h" // for BinaryDictPageDecoder
#include "olap/rowset/segment_v2/binary_plain_page.h"
Expand Down Expand Up @@ -242,11 +243,12 @@ Status ColumnReader::new_bitmap_index_iterator(BitmapIndexIterator** iterator) {
}

Status ColumnReader::new_inverted_index_iterator(const TabletIndex* index_meta,
OlapReaderStatistics* stats,
const StorageReadOptions& read_options,
std::unique_ptr<InvertedIndexIterator>* iterator) {
RETURN_IF_ERROR(_ensure_inverted_index_loaded(index_meta));
if (_inverted_index) {
RETURN_IF_ERROR(_inverted_index->new_iterator(stats, iterator));
RETURN_IF_ERROR(_inverted_index->new_iterator(read_options.stats,
read_options.runtime_state, iterator));
}
return Status::OK();
}
Expand Down
4 changes: 3 additions & 1 deletion be/src/olap/rowset/segment_v2/column_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ class WrapperField;
class AndBlockColumnPredicate;
class ColumnPredicate;
class TabletIndex;
class StorageReadOptions;

namespace io {
class FileReader;
Expand Down Expand Up @@ -119,7 +120,8 @@ class ColumnReader {
// Client should delete returned iterator
Status new_bitmap_index_iterator(BitmapIndexIterator** iterator);

Status new_inverted_index_iterator(const TabletIndex* index_meta, OlapReaderStatistics* stats,
Status new_inverted_index_iterator(const TabletIndex* index_meta,
const StorageReadOptions& read_options,
std::unique_ptr<InvertedIndexIterator>* iterator);

// Seek to the first entry in the column.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "conjunction_query.h"

#include <cstdint>

namespace doris {

ConjunctionQuery::ConjunctionQuery(IndexReader* reader)
: _reader(reader), _index_version(reader->getIndexVersion()) {}

ConjunctionQuery::~ConjunctionQuery() {
for (auto& term : _terms) {
if (term) {
_CLDELETE(term);
}
}
for (auto& term_doc : _term_docs) {
if (term_doc) {
_CLDELETE(term_doc);
}
}
}

void ConjunctionQuery::add(const std::wstring& field_name,
const std::vector<std::wstring>& wterms) {
if (wterms.size() < 1) {
_CLTHROWA(CL_ERR_IllegalArgument, "ConjunctionQuery::add: terms.size() < 1");
}

std::vector<TermIterator> iterators;
for (auto& wterm : wterms) {
Term* t = _CLNEW Term(field_name.c_str(), wterm.c_str());
_terms.push_back(t);
TermDocs* term_doc = _reader->termDocs(t);
_term_docs.push_back(term_doc);
iterators.emplace_back(term_doc);
}

std::sort(iterators.begin(), iterators.end(), [](const TermIterator& a, const TermIterator& b) {
return a.docFreq() < b.docFreq();
});

if (iterators.size() == 1) {
_lead1 = iterators[0];
} else {
_lead1 = iterators[0];
_lead2 = iterators[1];
for (int32_t i = 2; i < _terms.size(); i++) {
_others.push_back(iterators[i]);
}
}

if (_index_version == IndexVersion::kV1 && iterators.size() >= 2) {
int32_t little = iterators[0].docFreq();
int32_t big = iterators[iterators.size() - 1].docFreq();
if (little == 0 || (big / little) > _conjunction_ratio) {
_use_skip = true;
}
}
}

void ConjunctionQuery::search(roaring::Roaring& roaring) {
if (_lead1.isEmpty()) {
return;
}

if (!_use_skip) {
search_by_bitmap(roaring);
return;
}

search_by_skiplist(roaring);
}

void ConjunctionQuery::search_by_bitmap(roaring::Roaring& roaring) {
// can get a term of all docid
auto func = [&roaring](const TermIterator& term_docs, bool first) {
roaring::Roaring result;
DocRange doc_range;
while (term_docs.readRange(&doc_range)) {
if (doc_range.type_ == DocRangeType::kMany) {
result.addMany(doc_range.doc_many_size_, doc_range.doc_many->data());
} else {
result.addRange(doc_range.doc_range.first, doc_range.doc_range.second);
}
}
if (first) {
roaring.swap(result);
} else {
roaring &= result;
}
};

// fill the bitmap for the first time
func(_lead1, true);

// the second inverted list may be empty
if (!_lead2.isEmpty()) {
func(_lead2, false);
}

// The inverted index iterators contained in the _others array must not be empty
for (auto& other : _others) {
func(other, false);
}
}

void ConjunctionQuery::search_by_skiplist(roaring::Roaring& roaring) {
int32_t doc = 0;
int32_t first_doc = _lead1.nextDoc();
while ((doc = do_next(first_doc)) != INT32_MAX) {
roaring.add(doc);
}
}

int32_t ConjunctionQuery::do_next(int32_t doc) {
while (true) {
assert(doc == _lead1.docID());

// the skip list is used to find the two smallest inverted lists
int32_t next2 = _lead2.advance(doc);
if (next2 != doc) {
doc = _lead1.advance(next2);
if (next2 != doc) {
continue;
}
}

// if both lead1 and lead2 exist, use skip list to lookup other inverted indexes
bool advance_head = false;
for (auto& other : _others) {
if (other.isEmpty()) {
continue;
}

if (other.docID() < doc) {
int32_t next = other.advance(doc);
if (next > doc) {
doc = _lead1.advance(next);
advance_head = true;
break;
}
}
}
if (advance_head) {
continue;
}

return doc;
}
}

} // namespace doris
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <CLucene.h>
#include <CLucene/index/IndexReader.h>
#include <CLucene/index/IndexVersion.h>
#include <CLucene/index/Term.h>
#include <CLucene/search/query/TermIterator.h>

#include "roaring/roaring.hh"

CL_NS_USE(index)

namespace doris {

class ConjunctionQuery {
public:
ConjunctionQuery(IndexReader* reader);
~ConjunctionQuery();

void set_conjunction_ratio(int32_t conjunction_ratio) {
_conjunction_ratio = conjunction_ratio;
}

void add(const std::wstring& field_name, const std::vector<std::wstring>& wterms);
void search(roaring::Roaring& roaring);

private:
void search_by_bitmap(roaring::Roaring& roaring);
void search_by_skiplist(roaring::Roaring& roaring);

int32_t do_next(int32_t doc);

IndexReader* _reader = nullptr;
IndexVersion _index_version = IndexVersion::kV0;
int32_t _conjunction_ratio = 1000;
bool _use_skip = false;

TermIterator _lead1;
TermIterator _lead2;
std::vector<TermIterator> _others;

std::vector<Term*> _terms;
std::vector<TermDocs*> _term_docs;
};

} // namespace doris
6 changes: 6 additions & 0 deletions be/src/olap/rowset/segment_v2/inverted_index_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,13 @@

#pragma once

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Woverloaded-virtual"

#include <CLucene.h> // IWYU pragma: keep

#pragma GCC diagnostic pop

#include <CLucene/config/repl_wchar.h>
#include <CLucene/util/Misc.h>
#include <butil/macros.h>
Expand Down
3 changes: 2 additions & 1 deletion be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ Status compact_column(int32_t index_id, int src_segment_num, int dest_segment_nu
std::vector<uint32_t> dest_segment_num_rows) {
lucene::store::Directory* dir =
DorisCompoundDirectory::getDirectory(fs, index_writer_path.c_str(), false);
auto index_writer = _CLNEW lucene::index::IndexWriter(dir, nullptr, true /* create */,
lucene::analysis::SimpleAnalyzer<char> analyzer;
auto index_writer = _CLNEW lucene::index::IndexWriter(dir, &analyzer, true /* create */,
true /* closeDirOnShutdown */);

// get compound directory src_index_dirs
Expand Down
Loading

0 comments on commit 153c798

Please sign in to comment.