Skip to content

Commit

Permalink
[fix](index compaction)support compact multi segments in one index ap…
Browse files Browse the repository at this point in the history
  • Loading branch information
qidaye authored Dec 29, 2023
1 parent 07de412 commit 6495eb2
Show file tree
Hide file tree
Showing 10 changed files with 544 additions and 12 deletions.
2 changes: 1 addition & 1 deletion .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,4 @@
[submodule "be/src/clucene"]
path = be/src/clucene
url = https://github.com/apache/doris-thirdparty.git
branch = clucene
branch = clucene-2.0
5 changes: 4 additions & 1 deletion be/src/common/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -996,14 +996,17 @@ DEFINE_String(inverted_index_query_cache_limit, "10%");

// inverted index
DEFINE_mDouble(inverted_index_ram_buffer_size, "512");
// -1 means this limit is not in effect.
// Normally this should not be changed; it is mainly useful for testing.
DEFINE_mInt32(inverted_index_max_buffered_docs, "-1");
DEFINE_Int32(query_bkd_inverted_index_limit_percent, "5"); // 5%
// dictionary path for the Chinese analyzer
DEFINE_String(inverted_index_dict_path, "${DORIS_HOME}/dict");
DEFINE_Int32(inverted_index_read_buffer_size, "4096");
// tree depth for bkd index
DEFINE_Int32(max_depth_in_bkd_tree, "32");
// index compaction
DEFINE_Bool(inverted_index_compaction_enable, "false");
DEFINE_mBool(inverted_index_compaction_enable, "false");
// use num_broadcast_buffer blocks as buffer to do broadcast
DEFINE_Int32(num_broadcast_buffer, "32");
// semi-structure configs
Expand Down
3 changes: 2 additions & 1 deletion be/src/common/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -1037,10 +1037,11 @@ DECLARE_Int32(query_bkd_inverted_index_limit_percent); // 5%
// dictionary path for the Chinese analyzer
DECLARE_String(inverted_index_dict_path);
DECLARE_Int32(inverted_index_read_buffer_size);
DECLARE_mInt32(inverted_index_max_buffered_docs);
// tree depth for bkd index
DECLARE_Int32(max_depth_in_bkd_tree);
// index compaction
DECLARE_Bool(inverted_index_compaction_enable);
DECLARE_mBool(inverted_index_compaction_enable);
// use num_broadcast_buffer blocks as buffer to do broadcast
DECLARE_Int32(num_broadcast_buffer);
// semi-structure configs
Expand Down
38 changes: 33 additions & 5 deletions be/src/olap/compaction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,34 @@ Status Compaction::do_compaction_impl(int64_t permits) {

if (_input_row_num > 0 && stats.rowid_conversion && config::inverted_index_compaction_enable) {
OlapStopWatch inverted_watch;

// check rowid_conversion correctness
Version version = _tablet->max_version();
DeleteBitmap output_rowset_delete_bitmap(_tablet->tablet_id());
std::set<RowLocation> missed_rows;
std::map<RowsetSharedPtr, std::list<std::pair<RowLocation, RowLocation>>> location_map;
// Convert the delete bitmap of the input rowsets to output rowset.
std::size_t missed_rows_size = 0;
_tablet->calc_compaction_output_rowset_delete_bitmap(
_input_rowsets, _rowid_conversion, 0, version.second + 1, &missed_rows,
&location_map, _tablet->tablet_meta()->delete_bitmap(),
&output_rowset_delete_bitmap);
if (!allow_delete_in_cumu_compaction()) {
missed_rows_size = missed_rows.size();
if (compaction_type() == ReaderType::READER_CUMULATIVE_COMPACTION &&
stats.merged_rows != missed_rows_size) {
std::string err_msg = fmt::format(
"cumulative compaction: the merged rows({}) is not equal to missed "
"rows({}) in rowid conversion, tablet_id: {}, table_id:{}",
stats.merged_rows, missed_rows_size, _tablet->tablet_id(),
_tablet->table_id());
DCHECK(false) << err_msg;
LOG(WARNING) << err_msg;
}
}

RETURN_IF_ERROR(_tablet->check_rowid_conversion(_output_rowset, location_map));

// translation vec
// <<dest_idx_num, dest_docId>>
// the first level vector: index indicates src segment.
Expand All @@ -428,7 +456,7 @@ Status Compaction::do_compaction_impl(int64_t permits) {
// src index files
// format: rowsetId_segmentId
std::vector<std::string> src_index_files(src_segment_num);
for (auto m : src_seg_to_id_map) {
for (const auto& m : src_seg_to_id_map) {
std::pair<RowsetId, uint32_t> p = m.first;
src_index_files[m.second] = p.first.to_string() + "_" + std::to_string(p.second);
}
Expand Down Expand Up @@ -677,11 +705,11 @@ Status Compaction::modify_rowsets(const Merger::Statistics* stats) {
// of incremental data later.
// TODO(LiaoXin): check if there are duplicate keys
std::size_t missed_rows_size = 0;
_tablet->calc_compaction_output_rowset_delete_bitmap(
_input_rowsets, _rowid_conversion, 0, version.second + 1, &missed_rows,
&location_map, _tablet->tablet_meta()->delete_bitmap(),
&output_rowset_delete_bitmap);
if (!allow_delete_in_cumu_compaction()) {
_tablet->calc_compaction_output_rowset_delete_bitmap(
_input_rowsets, _rowid_conversion, 0, version.second + 1, &missed_rows,
&location_map, _tablet->tablet_meta()->delete_bitmap(),
&output_rowset_delete_bitmap);
missed_rows_size = missed_rows.size();
if (compaction_type() == ReaderType::READER_CUMULATIVE_COMPACTION && stats != nullptr &&
stats->merged_rows != missed_rows_size) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ Status compact_column(int32_t index_id, int src_segment_num, int dest_segment_nu
dest_index_dirs[i] = DorisCompoundDirectory::getDirectory(fs, path.c_str(), true);
}

DCHECK_EQ(src_index_dirs.size(), trans_vec.size());
index_writer->indexCompaction(src_index_dirs, dest_index_dirs, trans_vec,
dest_segment_num_rows);

Expand Down
3 changes: 1 addition & 2 deletions be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@

namespace doris::segment_v2 {
const int32_t MAX_FIELD_LEN = 0x7FFFFFFFL;
const int32_t MAX_BUFFER_DOCS = 100000000;
const int32_t MERGE_FACTOR = 100000000;
const int32_t MAX_LEAF_COUNT = 1024;
const float MAXMBSortInHeap = 512.0 * 8;
Expand Down Expand Up @@ -193,8 +192,8 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
}
_index_writer = std::make_unique<lucene::index::IndexWriter>(_dir.get(), _analyzer.get(),
create, true);
_index_writer->setMaxBufferedDocs(MAX_BUFFER_DOCS);
_index_writer->setRAMBufferSizeMB(config::inverted_index_ram_buffer_size);
_index_writer->setMaxBufferedDocs(config::inverted_index_max_buffered_docs);
_index_writer->setMaxFieldLength(MAX_FIELD_LEN);
_index_writer->setMergeFactor(MERGE_FACTOR);
_index_writer->setUseCompoundFile(false);
Expand Down
2 changes: 1 addition & 1 deletion build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ update_submodule() {
}

update_submodule "be/src/apache-orc" "apache-orc" "https://github.com/apache/doris-thirdparty/archive/refs/heads/orc.tar.gz"
update_submodule "be/src/clucene" "clucene" "https://github.com/apache/doris-thirdparty/archive/refs/heads/clucene.tar.gz"
update_submodule "be/src/clucene" "clucene" "https://github.com/apache/doris-thirdparty/archive/refs/heads/clucene-2.0.tar.gz"

if [[ "${CLEAN}" -eq 1 && "${BUILD_BE}" -eq 0 && "${BUILD_FE}" -eq 0 && "${BUILD_SPARK_DPP}" -eq 0 ]]; then
clean_gensrc
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !sql --
2018-02-21T12:00 1 I'm using the builds
2018-02-21T12:00 1 I'm using the builds
2018-02-21T12:00 2 I'm using the builds
2018-02-21T12:00 2 I'm using the builds
2018-02-21T12:00 3 I'm using the builds
2018-02-21T12:00 3 I'm using the builds
2018-02-21T12:00 4 I'm using the builds
2018-02-21T12:00 4 I'm using the builds
2018-02-21T12:00 5 I'm using the builds
2018-02-21T12:00 5 I'm using the builds
2018-02-21T12:00 6 I'm using the builds
2018-02-21T12:00 6 I'm using the builds
2018-02-21T12:00 7 I'm using the builds
2018-02-21T12:00 7 I'm using the builds
2018-02-21T12:00 8 I'm using the builds
2018-02-21T12:00 8 I'm using the builds
2018-02-21T12:00 9 I'm using the builds
2018-02-21T12:00 9 I'm using the builds
2018-02-21T12:00 10 I'm using the builds
2018-02-21T12:00 10 I'm using the builds

-- !sql --
2018-02-21T12:00 1 I'm using the builds
2018-02-21T12:00 1 I'm using the builds
2018-02-21T12:00 2 I'm using the builds
2018-02-21T12:00 2 I'm using the builds
2018-02-21T12:00 3 I'm using the builds
2018-02-21T12:00 3 I'm using the builds
2018-02-21T12:00 4 I'm using the builds
2018-02-21T12:00 4 I'm using the builds
2018-02-21T12:00 5 I'm using the builds
2018-02-21T12:00 5 I'm using the builds
2018-02-21T12:00 6 I'm using the builds
2018-02-21T12:00 6 I'm using the builds
2018-02-21T12:00 7 I'm using the builds
2018-02-21T12:00 7 I'm using the builds
2018-02-21T12:00 8 I'm using the builds
2018-02-21T12:00 8 I'm using the builds
2018-02-21T12:00 9 I'm using the builds
2018-02-21T12:00 9 I'm using the builds
2018-02-21T12:00 10 I'm using the builds
2018-02-21T12:00 10 I'm using the builds

-- !sql --

-- !sql --
2018-02-21T12:00 1 I'm using the builds
2018-02-21T12:00 1 I'm using the builds
2018-02-21T12:00 2 I'm using the builds
2018-02-21T12:00 2 I'm using the builds
2018-02-21T12:00 3 I'm using the builds
2018-02-21T12:00 3 I'm using the builds
2018-02-21T12:00 4 I'm using the builds
2018-02-21T12:00 4 I'm using the builds
2018-02-21T12:00 5 I'm using the builds
2018-02-21T12:00 5 I'm using the builds
2018-02-21T12:00 6 I'm using the builds
2018-02-21T12:00 6 I'm using the builds
2018-02-21T12:00 7 I'm using the builds
2018-02-21T12:00 7 I'm using the builds

-- !sql --
2018-02-21T12:00 1 I'm using the builds
2018-02-21T12:00 1 I'm using the builds
2018-02-21T12:00 2 I'm using the builds
2018-02-21T12:00 2 I'm using the builds
2018-02-21T12:00 3 I'm using the builds
2018-02-21T12:00 3 I'm using the builds
2018-02-21T12:00 4 I'm using the builds
2018-02-21T12:00 4 I'm using the builds
2018-02-21T12:00 5 I'm using the builds
2018-02-21T12:00 5 I'm using the builds
2018-02-21T12:00 6 I'm using the builds
2018-02-21T12:00 6 I'm using the builds
2018-02-21T12:00 7 I'm using the builds
2018-02-21T12:00 7 I'm using the builds
2018-02-21T12:00 8 I'm using the builds
2018-02-21T12:00 8 I'm using the builds
2018-02-21T12:00 9 I'm using the builds
2018-02-21T12:00 9 I'm using the builds
2018-02-21T12:00 10 I'm using the builds
2018-02-21T12:00 10 I'm using the builds

-- !sql --
2018-02-21T12:00 1 I'm using the builds
2018-02-21T12:00 1 I'm using the builds
2018-02-21T12:00 2 I'm using the builds
2018-02-21T12:00 2 I'm using the builds
2018-02-21T12:00 3 I'm using the builds
2018-02-21T12:00 3 I'm using the builds
2018-02-21T12:00 4 I'm using the builds
2018-02-21T12:00 4 I'm using the builds
2018-02-21T12:00 5 I'm using the builds
2018-02-21T12:00 5 I'm using the builds
2018-02-21T12:00 6 I'm using the builds
2018-02-21T12:00 6 I'm using the builds
2018-02-21T12:00 7 I'm using the builds
2018-02-21T12:00 7 I'm using the builds
2018-02-21T12:00 8 I'm using the builds
2018-02-21T12:00 8 I'm using the builds
2018-02-21T12:00 9 I'm using the builds
2018-02-21T12:00 9 I'm using the builds
2018-02-21T12:00 10 I'm using the builds
2018-02-21T12:00 10 I'm using the builds

-- !sql --

-- !sql --
2018-02-21T12:00 1 I'm using the builds
2018-02-21T12:00 1 I'm using the builds
2018-02-21T12:00 2 I'm using the builds
2018-02-21T12:00 2 I'm using the builds
2018-02-21T12:00 3 I'm using the builds
2018-02-21T12:00 3 I'm using the builds
2018-02-21T12:00 4 I'm using the builds
2018-02-21T12:00 4 I'm using the builds
2018-02-21T12:00 5 I'm using the builds
2018-02-21T12:00 5 I'm using the builds
2018-02-21T12:00 6 I'm using the builds
2018-02-21T12:00 6 I'm using the builds
2018-02-21T12:00 7 I'm using the builds
2018-02-21T12:00 7 I'm using the builds

-- !sql --
2018-02-21T12:00 1 I'm using the builds
2018-02-21T12:00 1 I'm using the builds
2018-02-21T12:00 1 I'm using the builds
2018-02-21T12:00 2 I'm using the builds
2018-02-21T12:00 2 I'm using the builds
2018-02-21T12:00 2 I'm using the builds
2018-02-21T12:00 3 I'm using the builds
2018-02-21T12:00 3 I'm using the builds
2018-02-21T12:00 3 I'm using the builds
2018-02-21T12:00 4 I'm using the builds
2018-02-21T12:00 4 I'm using the builds
2018-02-21T12:00 4 I'm using the builds
2018-02-21T12:00 5 I'm using the builds
2018-02-21T12:00 5 I'm using the builds
2018-02-21T12:00 5 I'm using the builds
2018-02-21T12:00 6 I'm using the builds
2018-02-21T12:00 6 I'm using the builds
2018-02-21T12:00 6 I'm using the builds
2018-02-21T12:00 7 I'm using the builds
2018-02-21T12:00 7 I'm using the builds
2018-02-21T12:00 7 I'm using the builds
2018-02-21T12:00 8 I'm using the builds
2018-02-21T12:00 8 I'm using the builds
2018-02-21T12:00 8 I'm using the builds
2018-02-21T12:00 9 I'm using the builds
2018-02-21T12:00 9 I'm using the builds
2018-02-21T12:00 9 I'm using the builds
2018-02-21T12:00 10 I'm using the builds
2018-02-21T12:00 10 I'm using the builds
2018-02-21T12:00 10 I'm using the builds

-- !sql --
2018-02-21T12:00 1 I'm using the builds
2018-02-21T12:00 1 I'm using the builds
2018-02-21T12:00 1 I'm using the builds
2018-02-21T12:00 2 I'm using the builds
2018-02-21T12:00 2 I'm using the builds
2018-02-21T12:00 2 I'm using the builds
2018-02-21T12:00 3 I'm using the builds
2018-02-21T12:00 3 I'm using the builds
2018-02-21T12:00 3 I'm using the builds
2018-02-21T12:00 4 I'm using the builds
2018-02-21T12:00 4 I'm using the builds
2018-02-21T12:00 4 I'm using the builds
2018-02-21T12:00 5 I'm using the builds
2018-02-21T12:00 5 I'm using the builds
2018-02-21T12:00 5 I'm using the builds
2018-02-21T12:00 6 I'm using the builds
2018-02-21T12:00 6 I'm using the builds
2018-02-21T12:00 6 I'm using the builds
2018-02-21T12:00 7 I'm using the builds
2018-02-21T12:00 7 I'm using the builds
2018-02-21T12:00 7 I'm using the builds
2018-02-21T12:00 8 I'm using the builds
2018-02-21T12:00 8 I'm using the builds
2018-02-21T12:00 8 I'm using the builds
2018-02-21T12:00 9 I'm using the builds
2018-02-21T12:00 9 I'm using the builds
2018-02-21T12:00 9 I'm using the builds
2018-02-21T12:00 10 I'm using the builds
2018-02-21T12:00 10 I'm using the builds
2018-02-21T12:00 10 I'm using the builds

-- !sql --

-- !sql --
2018-02-21T12:00 1 I'm using the builds
2018-02-21T12:00 1 I'm using the builds
2018-02-21T12:00 1 I'm using the builds
2018-02-21T12:00 2 I'm using the builds
2018-02-21T12:00 2 I'm using the builds
2018-02-21T12:00 2 I'm using the builds
2018-02-21T12:00 3 I'm using the builds
2018-02-21T12:00 3 I'm using the builds
2018-02-21T12:00 3 I'm using the builds
2018-02-21T12:00 4 I'm using the builds
2018-02-21T12:00 4 I'm using the builds
2018-02-21T12:00 4 I'm using the builds
2018-02-21T12:00 5 I'm using the builds
2018-02-21T12:00 5 I'm using the builds
2018-02-21T12:00 5 I'm using the builds
2018-02-21T12:00 6 I'm using the builds
2018-02-21T12:00 6 I'm using the builds
2018-02-21T12:00 6 I'm using the builds
2018-02-21T12:00 7 I'm using the builds
2018-02-21T12:00 7 I'm using the builds
2018-02-21T12:00 7 I'm using the builds

Loading

0 comments on commit 6495eb2

Please sign in to comment.