Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Enhancement] Rank topn prune redundant rows #8108

Merged
merged 6 commits into from
Jul 4, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 21 additions & 21 deletions be/src/exec/vectorized/chunks_sorter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,60 +50,60 @@ void DataSegment::init(const std::vector<ExprContext*>* sort_exprs, const ChunkP
Status DataSegment::get_filter_array(std::vector<DataSegment>& data_segments, size_t rows_to_sort,
std::vector<std::vector<uint8_t>>& filter_array,
const std::vector<int>& sort_order_flags, const std::vector<int>& null_first_flags,
uint32_t& least_num, uint32_t& middle_num) {
uint32_t& smaller_num, uint32_t& include_num) {
size_t dats_segment_size = data_segments.size();
std::vector<CompareVector> compare_results_array(dats_segment_size);

// first compare with last row of this chunk.
// First compare the chunk with last row of this segment.
{
get_compare_results_colwise(rows_to_sort - 1, order_by_columns, compare_results_array, data_segments,
sort_order_flags, null_first_flags);
}

// but we only have one compare.
// compare with first row of this DataSegment,
// then we set BEFORE_LAST_RESULT and IN_LAST_RESULT at filter_array.
// Since the first and the last rows of the segment have the same value,
// we can get both the `SMALLER_THAN_MIN_OF_SEGMENT` and `INCLUDE_IN_SEGMENT` parts
// with only one comparison.
if (rows_to_sort == 1) {
least_num = 0, middle_num = 0;
smaller_num = 0, include_num = 0;
filter_array.resize(dats_segment_size);
for (size_t i = 0; i < dats_segment_size; ++i) {
size_t rows = data_segments[i].chunk->num_rows();
filter_array[i].resize(rows);

for (size_t j = 0; j < rows; ++j) {
if (compare_results_array[i][j] < 0) {
filter_array[i][j] = DataSegment::BEFORE_LAST_RESULT;
++least_num;
filter_array[i][j] = DataSegment::SMALLER_THAN_MIN_OF_SEGMENT;
++smaller_num;
} else {
filter_array[i][j] = DataSegment::IN_LAST_RESULT;
++middle_num;
filter_array[i][j] = DataSegment::INCLUDE_IN_SEGMENT;
++include_num;
}
}
}
} else {
std::vector<size_t> first_size_array;
first_size_array.resize(dats_segment_size);

middle_num = 0;
include_num = 0;
filter_array.resize(dats_segment_size);
for (size_t i = 0; i < dats_segment_size; ++i) {
DataSegment& segment = data_segments[i];
size_t rows = segment.chunk->num_rows();
filter_array[i].resize(rows);

size_t local_first_size = middle_num;
size_t local_first_size = include_num;
for (size_t j = 0; j < rows; ++j) {
if (compare_results_array[i][j] <= 0) {
filter_array[i][j] = DataSegment::IN_LAST_RESULT;
++middle_num;
filter_array[i][j] = DataSegment::INCLUDE_IN_SEGMENT;
++include_num;
}
}

// obtain number of rows for second compare.
first_size_array[i] = middle_num - local_first_size;
// Obtain number of rows for second compare.
first_size_array[i] = include_num - local_first_size;
}

// second compare with first row of this chunk, use rows from first compare.
// Second compare with first row of this chunk, use rows from first compare.
{
for (size_t i = 0; i < dats_segment_size; i++) {
for (auto& cmp : compare_results_array[i]) {
Expand All @@ -116,19 +116,19 @@ Status DataSegment::get_filter_array(std::vector<DataSegment>& data_segments, si
null_first_flags);
}

least_num = 0;
smaller_num = 0;
for (size_t i = 0; i < dats_segment_size; ++i) {
DataSegment& segment = data_segments[i];
size_t rows = segment.chunk->num_rows();

for (size_t j = 0; j < rows; ++j) {
if (compare_results_array[i][j] < 0) {
filter_array[i][j] = DataSegment::BEFORE_LAST_RESULT;
++least_num;
filter_array[i][j] = DataSegment::SMALLER_THAN_MIN_OF_SEGMENT;
++smaller_num;
}
}
}
middle_num -= least_num;
include_num -= smaller_num;
}

return Status::OK();
Expand Down
17 changes: 8 additions & 9 deletions be/src/exec/vectorized/chunks_sorter.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@
namespace starrocks::vectorized {

struct DataSegment {
static const uint8_t BEFORE_LAST_RESULT = 2;
static const uint8_t IN_LAST_RESULT = 1;
static const uint8_t SMALLER_THAN_MIN_OF_SEGMENT = 2;
static const uint8_t INCLUDE_IN_SEGMENT = 1;
static const uint8_t LARGER_THAN_MAX_OF_SEGMENT = 0;

ChunkPtr chunk;
Columns order_by_columns;
Expand All @@ -27,15 +28,13 @@ struct DataSegment {

void init(const std::vector<ExprContext*>* sort_exprs, const ChunkPtr& cnk);

// there is two compares in the method,
// There are two comparisons in this method,
// the first is:
// compare every row in every DataSegment of data_segments with rows_to_sort - 1 row of this DataSegment,
// obtain every row compare result in compare_results_array, if < 0, use it to set IN at filter_array.
// compare every row in every DataSegment of data_segments with `rows_to_sort - 1` row of this DataSegment,
// obtain every row compare result in compare_results_array, if <= 0, mark it with `INCLUDE_IN_SEGMENT`.
// the second is:
// compare every row in compare_results_array that less than 0, use it to compare with first row of this DataSegment,
// as the first step, we set BEFORE_LAST_RESULT at filter_array.
//
// Actually, we Count the results in the first compare for the second compare.
// compare every row in compare_results_array that <= 0 (i.e. `INCLUDE_IN_SEGMENT` part) with the first row of this DataSegment,
// if < 0, then mark it with `SMALLER_THAN_MIN_OF_SEGMENT`
// Out-parameters (named to match the definition in chunks_sorter.cpp):
//   smaller_num - count of rows marked `SMALLER_THAN_MIN_OF_SEGMENT` in filter_array.
//   include_num - count of rows left marked `INCLUDE_IN_SEGMENT` in filter_array
//                 (rows re-marked as smaller are subtracted from it).
Status get_filter_array(std::vector<DataSegment>& data_segments, size_t rows_to_sort,
                        std::vector<std::vector<uint8_t>>& filter_array, const std::vector<int>& sort_order_flags,
                        const std::vector<int>& null_first_flags, uint32_t& smaller_num, uint32_t& include_num);
Expand Down
Loading