@@ -140,17 +140,29 @@ TSerializedCellVec DoFindSplitKey(const TVector<std::pair<TSerializedCellVec, ui
140140 auto loIt = std::upper_bound (keysHist.begin (), keysHist.end (), total*0.1 , fnValueLess);
141141 auto hiIt = std::upper_bound (keysHist.begin (), keysHist.end (), total*0.9 , fnValueLess);
142142
143- auto fnCmp = [&keyColumnTypes, prefixSize] (const auto & bucket1, const auto & bucket2) {
144- return CompareTypedCellVectors (bucket1.first .GetCells ().data (), bucket2.first .GetCells ().data (),
145- keyColumnTypes.data (),
146- std::min (bucket1.first .GetCells ().size (), prefixSize), std::min (bucket2.first .GetCells ().size (), prefixSize));
143+ // compare histogram entries by key prefixes
144+ auto comparePrefix = [&keyColumnTypes] (const auto & entry1, const auto & entry2, const size_t prefixSize) {
145+ const auto & key1cells = entry1.first .GetCells ();
146+ const auto clampedSize1 = std::min (key1cells.size (), prefixSize);
147+
148+ const auto & key2cells = entry2.first .GetCells ();
149+ const auto clampedSize2 = std::min (key2cells.size (), prefixSize);
150+
151+ int cmp = CompareTypedCellVectors (key1cells.data (), key2cells.data (), keyColumnTypes.data (), std::min (clampedSize1, clampedSize2));
152+ if (cmp == 0 && clampedSize1 != clampedSize2) {
153+ // smaller key prefix is filled with +inf => always bigger
154+ cmp = (clampedSize1 < clampedSize2) ? +1 : -1 ;
155+ }
156+ return cmp;
147157 };
148158
149159 // Check if half key is no equal to low and high keys
150- if (fnCmp (*halfIt, *loIt) == 0 )
160+ if (comparePrefix (*halfIt, *loIt, prefixSize ) == 0 ) {
151161 return TSerializedCellVec ();
152- if (fnCmp (*halfIt, *hiIt) == 0 )
162+ }
163+ if (comparePrefix (*halfIt, *hiIt, prefixSize) == 0 ) {
153164 return TSerializedCellVec ();
165+ }
154166
155167 // Build split key by leaving the prefix and extending it with NULLs
156168 TVector<TCell> splitKey (halfIt->first .GetCells ().begin (), halfIt->first .GetCells ().end ());
@@ -170,10 +182,17 @@ TSerializedCellVec ChooseSplitKeyByKeySample(const NKikimrTableStats::THistogram
170182 keysHist.emplace_back (std::make_pair (TSerializedCellVec (bucket.GetKey ()), bucket.GetValue ()));
171183 }
172184
173- auto fnCmp = [&keyColumnTypes] (const auto & key1, const auto & key2) {
174- return CompareTypedCellVectors (key1.first .GetCells ().data (), key2.first .GetCells ().data (),
175- keyColumnTypes.data (),
176- key1.first .GetCells ().size (), key2.first .GetCells ().size ());
185+ // compare histogram entries by keys
186+ auto fnCmp = [&keyColumnTypes] (const auto & entry1, const auto & entry2) {
187+ const auto & key1cells = entry1.first .GetCells ();
188+ const auto & key2cells = entry2.first .GetCells ();
189+ const auto minKeySize = std::min (key1cells.size (), key2cells.size ());
190+ int cmp = CompareTypedCellVectors (key1cells.data (), key2cells.data (), keyColumnTypes.data (), minKeySize);
191+ if (cmp == 0 && key1cells.size () != key2cells.size ()) {
192+ // smaller key is filled with +inf => always bigger
193+ cmp = (key1cells.size () < key2cells.size ()) ? +1 : -1 ;
194+ }
195+ return cmp;
177196 };
178197
179198 Sort (keysHist, [&fnCmp] (const auto & key1, const auto & key2) { return fnCmp (key1, key2) < 0 ; });
0 commit comments