Skip to content

Commit 319908d

Browse files
authored
25-1-1: schemeshard: fix split key selection from access sample (#18899)
2 parents 8eab311 + 5cb27b9 commit 319908d

File tree

3 files changed

+564
-10
lines changed

3 files changed

+564
-10
lines changed

ydb/core/tx/schemeshard/schemeshard__table_stats_histogram.cpp

Lines changed: 29 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -140,17 +140,29 @@ TSerializedCellVec DoFindSplitKey(const TVector<std::pair<TSerializedCellVec, ui
140140
auto loIt = std::upper_bound(keysHist.begin(), keysHist.end(), total*0.1, fnValueLess);
141141
auto hiIt = std::upper_bound(keysHist.begin(), keysHist.end(), total*0.9, fnValueLess);
142142

143-
auto fnCmp = [&keyColumnTypes, prefixSize] (const auto& bucket1, const auto& bucket2) {
144-
return CompareTypedCellVectors(bucket1.first.GetCells().data(), bucket2.first.GetCells().data(),
145-
keyColumnTypes.data(),
146-
std::min(bucket1.first.GetCells().size(), prefixSize), std::min(bucket2.first.GetCells().size(), prefixSize));
143+
// compare histogram entries by key prefixes
144+
auto comparePrefix = [&keyColumnTypes] (const auto& entry1, const auto& entry2, const size_t prefixSize) {
145+
const auto& key1cells = entry1.first.GetCells();
146+
const auto clampedSize1 = std::min(key1cells.size(), prefixSize);
147+
148+
const auto& key2cells = entry2.first.GetCells();
149+
const auto clampedSize2 = std::min(key2cells.size(), prefixSize);
150+
151+
int cmp = CompareTypedCellVectors(key1cells.data(), key2cells.data(), keyColumnTypes.data(), std::min(clampedSize1, clampedSize2));
152+
if (cmp == 0 && clampedSize1 != clampedSize2) {
153+
// smaller key prefix is filled with +inf => always bigger
154+
cmp = (clampedSize1 < clampedSize2) ? +1 : -1;
155+
}
156+
return cmp;
147157
};
148158

149159
// Check if half key is no equal to low and high keys
150-
if (fnCmp(*halfIt, *loIt) == 0)
160+
if (comparePrefix(*halfIt, *loIt, prefixSize) == 0) {
151161
return TSerializedCellVec();
152-
if (fnCmp(*halfIt, *hiIt) == 0)
162+
}
163+
if (comparePrefix(*halfIt, *hiIt, prefixSize) == 0) {
153164
return TSerializedCellVec();
165+
}
154166

155167
// Build split key by leaving the prefix and extending it with NULLs
156168
TVector<TCell> splitKey(halfIt->first.GetCells().begin(), halfIt->first.GetCells().end());
@@ -170,10 +182,17 @@ TSerializedCellVec ChooseSplitKeyByKeySample(const NKikimrTableStats::THistogram
170182
keysHist.emplace_back(std::make_pair(TSerializedCellVec(bucket.GetKey()), bucket.GetValue()));
171183
}
172184

173-
auto fnCmp = [&keyColumnTypes] (const auto& key1, const auto& key2) {
174-
return CompareTypedCellVectors(key1.first.GetCells().data(), key2.first.GetCells().data(),
175-
keyColumnTypes.data(),
176-
key1.first.GetCells().size(), key2.first.GetCells().size());
185+
// compare histogram entries by keys
186+
auto fnCmp = [&keyColumnTypes] (const auto& entry1, const auto& entry2) {
187+
const auto& key1cells = entry1.first.GetCells();
188+
const auto& key2cells = entry2.first.GetCells();
189+
const auto minKeySize = std::min(key1cells.size(), key2cells.size());
190+
int cmp = CompareTypedCellVectors(key1cells.data(), key2cells.data(), keyColumnTypes.data(), minKeySize);
191+
if (cmp == 0 && key1cells.size() != key2cells.size()) {
192+
// smaller key is filled with +inf => always bigger
193+
cmp = (key1cells.size() < key2cells.size()) ? +1 : -1;
194+
}
195+
return cmp;
177196
};
178197

179198
Sort(keysHist, [&fnCmp] (const auto& key1, const auto& key2) { return fnCmp(key1, key2) < 0; });

0 commit comments

Comments
 (0)