@@ -48,7 +48,7 @@ class TTableHistogramBuilderBtreeIndex {
4848 {
4949 }
5050
51- TString ToString () const noexcept {
51+ TString ToString (const TKeyCellDefaults &keyDefaults ) const {
5252 return TStringBuilder ()
5353 << " Part: " << Part->Label .ToString ()
5454 << " PageId: " << PageId
@@ -57,8 +57,8 @@ class TTableHistogramBuilderBtreeIndex {
5757 << " EndRowId: " << EndRowId
5858 << " BeginDataSize: " << BeginDataSize
5959 << " EndDataSize: " << EndDataSize
60- << " BeginKey: " << BeginKey. Count ( )
61- << " EndKey: " << EndKey. Count ( )
60+ << " BeginKey: " << NFmt::Do (BeginKey, keyDefaults )
61+ << " EndKey: " << NFmt::Do (EndKey, keyDefaults )
6262 << " State: " << (ui32)State;
6363 }
6464
@@ -130,15 +130,17 @@ class TTableHistogramBuilderBtreeIndex {
130130 };
131131
132132 struct TEvent {
133- TCellsIterable Key;
134- bool IsBegin;
135133 TNodeState* Node;
134+ bool IsBegin;
136135
137- TString ToString () const noexcept {
136+ TString ToString (const TKeyCellDefaults &keyDefaults ) const {
138137 return TStringBuilder ()
139- << Node->ToString ()
140- << " IsBegin: " << IsBegin
141- << " Key: " << Key.Count ();
138+ << " IsBegin: " << IsBegin
139+ << " " << Node->ToString (keyDefaults);
140+ }
141+
142+ const TCellsIterable& GetKey () const {
143+ return IsBegin ? Node->BeginKey : Node->EndKey ;
142144 }
143145 };
144146
@@ -149,7 +151,7 @@ class TTableHistogramBuilderBtreeIndex {
149151 return Compare (a, b) > 0 ;
150152 }
151153
152- i8 Compare (const TEvent& a, const TEvent& b) const noexcept {
154+ int Compare (const TEvent& a, const TEvent& b) const {
153155 // events go in order:
154156 // - Key = {}, IsBegin = true
155157 // - ...
@@ -161,13 +163,16 @@ class TTableHistogramBuilderBtreeIndex {
161163 // - ...
162164 // - Key = {}, IsBegin = false
163165
164- if (a.Key && b.Key ) { // compare by keys
165- auto cmp = CompareKeys (a.Key , b.Key , KeyDefaults);
166+ // end goes before begin in order to
167+ // close previous node before open the next one
168+
169+ if (a.GetKey () && b.GetKey ()) { // compare by keys
170+ auto cmp = CompareKeys (a.GetKey (), b.GetKey (), KeyDefaults);
166171 if (cmp != 0 ) {
167172 return cmp;
168173 }
169174 // keys are the same, compare by begin flag, end events first:
170- return Compare (a.IsBegin ? 1 : -1 , b.IsBegin ? 1 : -1 );
175+ return Compare (a.IsBegin ? + 1 : -1 , b.IsBegin ? + 1 : -1 );
171176 }
172177
173178 // category = -1 for Key = { }, IsBegin = true
@@ -177,14 +182,14 @@ class TTableHistogramBuilderBtreeIndex {
177182 }
178183
179184 private:
180- static i8 GetCategory (const TEvent& a) noexcept {
181- if (a.Key ) {
185+ static int GetCategory (const TEvent& a) {
186+ if (a.GetKey () ) {
182187 return 0 ;
183188 }
184189 return a.IsBegin ? -1 : +1 ;
185190 }
186191
187- static i8 Compare (i8 a, i8 b) noexcept {
192+ static int Compare (int a, int b) {
188193 if (a < b) return -1 ;
189194 if (a > b) return +1 ;
190195 return 0 ;
@@ -226,6 +231,9 @@ class TTableHistogramBuilderBtreeIndex {
226231
227232 for (auto index : xrange (Subset.Flatten .size ())) {
228233 auto & part = Subset.Flatten [index];
234+ if (part.Slices ) {
235+ LOG_BUILD_STATS (" slicing part " << part->Label << " : " << NFmt::Do (*part.Slices , KeyDefaults));
236+ }
229237 auto & meta = part->IndexPages .GetBTree ({});
230238 TCellsIterable beginKey = EmptyKey;
231239 if (part.Slices && part.Slices ->front ().FirstKey .GetCells ()) {
@@ -235,7 +243,7 @@ class TTableHistogramBuilderBtreeIndex {
235243 if (part.Slices && part.Slices ->back ().LastKey .GetCells ()) {
236244 endKey = MakeCellsIterableKey (part.Part .Get (), part.Slices ->back ().LastKey );
237245 }
238- LoadedStateNodes.emplace_back (part.Part .Get (), meta.GetPageId (), meta.LevelCount , 0 , meta.GetRowCount (), 0 , meta.GetDataSize (), beginKey, endKey);
246+ LoadedStateNodes.emplace_back (part.Part .Get (), meta.GetPageId (), meta.LevelCount , 0 , meta.GetRowCount (), 0 , meta.GetTotalDataSize (), beginKey, endKey);
239247 ready &= SlicePart (*part.Slices , LoadedStateNodes.back ());
240248 }
241249
@@ -261,13 +269,13 @@ class TTableHistogramBuilderBtreeIndex {
261269
262270 if (it == slices.end () || node.EndRowId <= it->BeginRowId () || it->EndRowId () <= node.BeginRowId ) {
263271 // skip the node
264- LOG_BUILD_STATS (" slicing node " << node.ToString () << " => skip" );
272+ LOG_BUILD_STATS (" slicing node " << node.ToString (KeyDefaults ) << " => skip" );
265273 return true ;
266274 }
267275
268276 if (it->BeginRowId () <= node.BeginRowId && node.EndRowId <= it->EndRowId ()) {
269277 // take the node
270- LOG_BUILD_STATS (" slicing node " << node.ToString () << " => take" );
278+ LOG_BUILD_STATS (" slicing node " << node.ToString (KeyDefaults ) << " => take" );
271279 AddFutureEvents (node);
272280 return true ;
273281 }
@@ -278,17 +286,20 @@ class TTableHistogramBuilderBtreeIndex {
278286 // can't split, decide by node.EndRowId - 1
279287 // TODO: decide by non-empty slice and node intersection, but this requires size calculation changes too
280288 if (it->Has (node.EndRowId - 1 )) {
281- LOG_BUILD_STATS (" slicing node " << node.ToString () << " => take root" );
289+ LOG_BUILD_STATS (" slicing node " << node.ToString (KeyDefaults) << " => take leaf" );
290+ // the slice may start after node begin, shift the node begin to make it more sensible
291+ node.BeginRowId = it->BeginRowId ();
292+ node.BeginKey = MakeCellsIterableKey (node.Part , it->FirstKey );
282293 AddFutureEvents (node);
283294 } else {
284- LOG_BUILD_STATS (" slicing node " << node.ToString () << " => skip root " );
295+ LOG_BUILD_STATS (" slicing node " << node.ToString (KeyDefaults ) << " => skip leaf " );
285296 }
286297 return true ;
287298 }
288299
289300 bool ready = true ;
290301
291- LOG_BUILD_STATS (" slicing node " << node.ToString () << " => split" );
302+ LOG_BUILD_STATS (" slicing node " << node.ToString (KeyDefaults ) << " => split" );
292303 const auto addNode = [&](TNodeState& child) {
293304 ready &= SlicePart (slices, child);
294305 };
@@ -341,10 +352,10 @@ class TTableHistogramBuilderBtreeIndex {
341352 << " openedSortedByRowCount: " << openedSortedByRowCount.size ()
342353 << " openedSortedByDataSize: " << openedSortedByDataSize.size ()
343354 << " FutureEvents: " << FutureEvents.size ()
344- << " currentKeyPointer: " << currentKeyPointer.ToString ());
355+ << " currentKeyPointer: " << currentKeyPointer.ToString (KeyDefaults ));
345356
346357 auto processEvent = [&](const TEvent& event) {
347- LOG_BUILD_STATS (" processing event " << event.ToString ());
358+ LOG_BUILD_STATS (" processing event " << event.ToString (KeyDefaults ));
348359 Y_DEBUG_ABORT_UNLESS (NodeEventKeyGreater.Compare (event, currentKeyPointer) <= 0 , " Can't process future events" );
349360 if (event.IsBegin ) {
350361 if (event.Node ->Open (openedRowCount, openedDataSize)) {
@@ -370,7 +381,7 @@ class TTableHistogramBuilderBtreeIndex {
370381 // TODO: skip all closed nodes and don't process them here
371382 // TODO: don't compare each node key and replace it with parentNode.Seek(currentKeyPointer)
372383 auto cmp = NodeEventKeyGreater.Compare (event, currentKeyPointer);
373- LOG_BUILD_STATS (" adding event " << (i32 )cmp << " " << event.ToString ());
384+ LOG_BUILD_STATS (" adding event " << (i32 )cmp << " " << event.ToString (KeyDefaults ));
374385 if (cmp <= 0 ) { // event happened
375386 processEvent (event);
376387 if (cmp == 0 ) {
@@ -381,8 +392,8 @@ class TTableHistogramBuilderBtreeIndex {
381392 }
382393 };
383394 const auto addNode = [&](TNodeState& node) {
384- addEvent (TEvent{node. BeginKey , true , &node });
385- addEvent (TEvent{node. EndKey , false , &node });
395+ addEvent (TEvent{& node, true });
396+ addEvent (TEvent{& node, false });
386397 };
387398
388399 // may safely skip current key pointer and go further only if at the next iteration
@@ -395,7 +406,7 @@ class TTableHistogramBuilderBtreeIndex {
395406 openedSortedByRowCount.pop ();
396407
397408 LOG_BUILD_STATS (" loading node by row count trigger"
398- << node->ToString ()
409+ << node->ToString (KeyDefaults )
399410 << " closedRowCount: " << closedRowCount
400411 << " openedRowCount: " << openedRowCount
401412 << " nextHistogramRowCount: " << nextHistogramRowCount);
@@ -413,7 +424,7 @@ class TTableHistogramBuilderBtreeIndex {
413424 openedSortedByDataSize.pop ();
414425
415426 LOG_BUILD_STATS (" loading node by data size trigger"
416- << node->ToString ()
427+ << node->ToString (KeyDefaults )
417428 << " closedDataSize: " << closedDataSize
418429 << " openedDataSize: " << openedDataSize
419430 << " nextHistogramDataSize: " << nextHistogramDataSize);
@@ -439,7 +450,7 @@ class TTableHistogramBuilderBtreeIndex {
439450 << " openedSortedByRowCount: " << openedSortedByRowCount.size ()
440451 << " openedSortedByDataSize: " << openedSortedByDataSize.size ()
441452 << " FutureEvents: " << FutureEvents.size ()
442- << " currentKeyPointer: " << currentKeyPointer.ToString ());
453+ << " currentKeyPointer: " << currentKeyPointer.ToString (KeyDefaults ));
443454
444455 // add current key pointer to a histogram if we either:
445456 // - failed to split opened nodes and may exceed a next histogram bucket value (plus its gaps)
@@ -449,7 +460,7 @@ class TTableHistogramBuilderBtreeIndex {
449460 // - minus size of all nodes that start at current key pointer
450461 // - plus half of size of all ohter opened nodes (as they exact position is unknown)
451462 // also check that current key pointer value is > then last presented value in a histogram
452- if (currentKeyPointer.Key ) {
463+ if (currentKeyPointer.GetKey () ) {
453464 if (nextHistogramRowCount != Max<ui64>()) {
454465 if (closedRowCount + openedRowCount > nextHistogramRowCount + RowCountResolutionGap || closedRowCount > nextHistogramRowCount - RowCountResolutionGap) {
455466 ui64 currentKeyRowCountOpens = 0 ;
@@ -461,7 +472,7 @@ class TTableHistogramBuilderBtreeIndex {
461472 Y_ABORT_UNLESS (currentKeyRowCountOpens <= openedRowCount);
462473 ui64 currentKeyPointerRowCount = closedRowCount + (openedRowCount - currentKeyRowCountOpens) / 2 ;
463474 if ((stats.RowCountHistogram .empty () ? 0 : stats.RowCountHistogram .back ().Value ) < currentKeyPointerRowCount && currentKeyPointerRowCount < stats.RowCount ) {
464- AddKey (stats.RowCountHistogram , currentKeyPointer.Key , currentKeyPointerRowCount);
475+ AddKey (stats.RowCountHistogram , currentKeyPointer.GetKey () , currentKeyPointerRowCount);
465476 nextHistogramRowCount = Max (currentKeyPointerRowCount + 1 , nextHistogramRowCount + RowCountResolution);
466477 if (nextHistogramRowCount + RowCountResolutionGap > stats.RowCount ) {
467478 nextHistogramRowCount = Max<ui64>();
@@ -480,7 +491,7 @@ class TTableHistogramBuilderBtreeIndex {
480491 Y_ABORT_UNLESS (currentKeyDataSizeOpens <= openedDataSize);
481492 ui64 currentKeyPointerDataSize = closedDataSize + (openedDataSize - currentKeyDataSizeOpens) / 2 ;
482493 if ((stats.DataSizeHistogram .empty () ? 0 : stats.DataSizeHistogram .back ().Value ) < currentKeyPointerDataSize && currentKeyPointerDataSize < stats.DataSize .Size ) {
483- AddKey (stats.DataSizeHistogram , currentKeyPointer.Key , currentKeyPointerDataSize);
494+ AddKey (stats.DataSizeHistogram , currentKeyPointer.GetKey () , currentKeyPointerDataSize);
484495 nextHistogramDataSize = Max (currentKeyPointerDataSize + 1 , nextHistogramDataSize + DataSizeResolution);
485496 if (nextHistogramDataSize + DataSizeResolutionGap > stats.DataSize .Size ) {
486497 nextHistogramDataSize = Max<ui64>();
@@ -507,7 +518,7 @@ class TTableHistogramBuilderBtreeIndex {
507518 return true ;
508519 }
509520
510- void AddKey (THistogram& histogram, TCellsIterable& key, ui64 value) {
521+ void AddKey (THistogram& histogram, const TCellsIterable& key, ui64 value) {
511522 TVector<TCell> keyCells;
512523
513524 // add columns that are present in the part:
@@ -555,8 +566,14 @@ class TTableHistogramBuilderBtreeIndex {
555566 }
556567
557568 void AddFutureEvents (TNodeState& node) {
558- FutureEvents.push (TEvent{node.BeginKey , true , &node});
559- FutureEvents.push (TEvent{node.EndKey , false , &node});
569+ auto cmp = NodeEventKeyGreater.Compare (TEvent{&node, true }, TEvent{&node, false });
570+ LOG_BUILD_STATS (" adding node future events " << (i32 )cmp << " " << node.ToString (KeyDefaults));
571+ if (node.GetRowCount () > 1 ) {
572+ Y_DEBUG_ABORT_UNLESS (cmp < 0 );
573+ }
574+
575+ FutureEvents.push (TEvent{&node, true });
576+ FutureEvents.push (TEvent{&node, false });
560577 }
561578
562579private:
0 commit comments