44
55namespace NKikimr ::NArrow::NMerger {
66
// PutControlPoint: pushes a control-point iterator onto SortHeap.
// Diff summary: the removed ('-') variant took a std::shared_ptr<TSortableBatchPosition>
// and asserted it was non-null; the added ('+') variant takes the position by const
// reference, so the null check is dropped and member access changes from '->' to '.'.
7- void TMergePartialStream::PutControlPoint (std::shared_ptr<TSortableBatchPosition> point) {
8- Y_ABORT_UNLESS (point);
9- AFL_VERIFY (point->IsSameSortingSchema (SortSchema))(" point" , point->DebugJson ())(" schema" , SortSchema->ToString ());
10- Y_ABORT_UNLESS (point->IsReverseSort () == Reverse);
7+ void TMergePartialStream::PutControlPoint (const TSortableBatchPosition& point) {
// The point must use the same sorting schema and the same sort direction as this stream.
8+ AFL_VERIFY (point.IsSameSortingSchema (SortSchema))(" point" , point.DebugJson ())(" schema" , SortSchema->ToString ());
9+ Y_ABORT_UNLESS (point.IsReverseSort () == Reverse);
// Increments ControlPoints and aborts unless the result is exactly 1.
1110 Y_ABORT_UNLESS (++ControlPoints == 1 );
1211
// Old code built the iterator from the dereferenced pointer; new code builds it from
// point.BuildRWPosition() (presumably a read-write position derived from the const one — TODO confirm).
13- SortHeap.Push (TBatchIterator (* point));
12+ SortHeap.Push (TBatchIterator (point. BuildRWPosition () ));
1413}
1514
// RemoveControlPoint: unchanged context in this diff. Part of the body is elided by the
// hunk header below, so only the final statement (removing the top of SortHeap) is visible here.
1615void TMergePartialStream::RemoveControlPoint () {
@@ -21,14 +20,15 @@ void TMergePartialStream::RemoveControlPoint() {
2120 SortHeap.RemoveTop ();
2221}
2322
// CheckSequenceInDebug: ordering check on successive key positions; the visible body sits
// between the NDEBUG guard lines, so it is compiled only when NDEBUG is not defined.
// Diff summary: the parameter type changes from TSortableBatchPosition to
// TRWSortableBatchPosition, and the comparisons now go through a cursor obtained from
// BuildSortingCursor() instead of using the position object directly.
24- void TMergePartialStream::CheckSequenceInDebug (const TSortableBatchPosition & nextKeyColumnsPosition) {
23+ void TMergePartialStream::CheckSequenceInDebug (const TRWSortableBatchPosition & nextKeyColumnsPosition) {
2524#ifndef NDEBUG
25+ auto nextCursor = nextKeyColumnsPosition.BuildSortingCursor ();
// The check applies only once a previous position has been recorded in CurrentKeyColumns.
2626 if (CurrentKeyColumns) {
// Expected case: the previously recorded key compares strictly less than the next one.
27- const bool linearExecutionCorrectness = CurrentKeyColumns->Compare (nextKeyColumnsPosition ) == std::partial_ordering::less;
27+ const bool linearExecutionCorrectness = CurrentKeyColumns->Compare (nextCursor ) == std::partial_ordering::less;
2828 if (!linearExecutionCorrectness) {
// Otherwise the only accepted situation is that the next position is at index 0 and
// compares strictly less than the recorded key; anything else fails AFL_VERIFY.
29- const bool newSegmentScan = nextKeyColumnsPosition .GetPosition () == 0 ;
30- AFL_VERIFY (newSegmentScan && nextKeyColumnsPosition .Compare (*CurrentKeyColumns) == std::partial_ordering::less)
31- (" merge_debug" , DebugJson ())(" current_ext" , nextKeyColumnsPosition .DebugJson ())(" newSegmentScan" , newSegmentScan);
29+ const bool newSegmentScan = nextCursor .GetPosition () == 0 ;
30+ AFL_VERIFY (newSegmentScan && nextCursor .Compare (*CurrentKeyColumns) == std::partial_ordering::less)
31+ (" merge_debug" , DebugJson ())(" current_ext" , nextCursor .DebugJson ())(" newSegmentScan" , newSegmentScan);
3232 }
3333 }
// Record the position just checked as the reference for the next call.
3434 CurrentKeyColumns = nextKeyColumnsPosition;
// NOTE(review): the hunk header below elides the lines between this assignment and the
// closing of the NDEBUG guard; their content is not visible in this diff.
@@ -37,38 +37,40 @@ void TMergePartialStream::CheckSequenceInDebug(const TSortableBatchPosition& nex
3737#endif
3838}
3939
// DrainToControlPoint: drains positions from SortHeap into `builder` until the control
// point is reached, the heap is empty, or the builder reports its buffer exhausted.
// Returns true when the control point was reached, false otherwise.
// If `lastResultPosition` is non-null it receives the position of the last drained record.
// Diff summary: the out-parameter type changes from std::optional<TSortableBatchPosition>
// to std::optional<TCursor>; the new code tracks the last drained record as a
// (scan data, position) pair and converts it to a cursor once, on exit, instead of
// copying a position object on every iteration.
40- bool TMergePartialStream::DrainToControlPoint (TRecordBatchBuilder& builder, const bool includeFinish, std::optional<TSortableBatchPosition >* lastResultPosition) {
40+ bool TMergePartialStream::DrainToControlPoint (TRecordBatchBuilder& builder, const bool includeFinish, std::optional<TCursor >* lastResultPosition) {
// Preconditions: exactly one control point is registered and the builder has one
// column builder per DataSchema field.
4141 AFL_VERIFY (ControlPoints == 1 );
4242 Y_ABORT_UNLESS ((ui32)DataSchema->num_fields () == builder.GetBuildersCount ());
4343 builder.ValidateDataSchema (DataSchema);
4444 bool cpReachedFlag = false ;
// resultPosition is left uninitialized here; it is only read when resultScanData is
// non-null, and DrainCurrentPosition assigns both out-parameters together.
45+ std::shared_ptr<TSortableScanData> resultScanData;
46+ ui64 resultPosition;
4547 while (SortHeap.Size () && !cpReachedFlag && !builder.IsBufferExhausted ()) {
4648 if (SortHeap.Current ().IsControlPoint ()) {
// Capture the control point's key before removing it from the heap.
47- auto keyColumns = SortHeap.Current ().GetKeyColumns ();
49+ auto keyColumns = SortHeap.Current ().GetKeyColumns (). BuildSortingCursor () ;
4850 RemoveControlPoint ();
4951 cpReachedFlag = true ;
// Stop immediately when nothing is left, when the finish key must not be included,
// or when the new heap top compares greater than the control point's key.
5052 if (SortHeap.Empty () || !includeFinish || SortHeap.Current ().GetKeyColumns ().Compare (keyColumns) == std::partial_ordering::greater) {
53+ if (lastResultPosition && resultScanData) {
54+ *lastResultPosition = resultScanData->BuildCursor (resultPosition);
55+ }
5156 return true ;
5257 }
// Otherwise fall through and drain one more position in this iteration
// (presumably the rows whose key equals the control point — TODO confirm heap ordering).
5358 }
5459
// Old code: DrainCurrentPosition returned an optional position; the caller ran the
// debug sequence check, appended the record and updated lastResultPosition itself.
55- if (auto currentPosition = DrainCurrentPosition ()) {
56- CheckSequenceInDebug (*currentPosition);
57- builder.AddRecord (*currentPosition);
58- if (lastResultPosition) {
59- *lastResultPosition = *currentPosition;
60- }
61- }
// New code: DrainCurrentPosition receives the builder and the two out-parameters and
// does the append and the sequence check itself.
60+ DrainCurrentPosition (&builder, &resultScanData, &resultPosition);
61+ }
// Report the last drained record only if at least one record was recorded.
62+ if (lastResultPosition && resultScanData) {
63+ *lastResultPosition = resultScanData->BuildCursor (resultPosition);
6264 }
6365 return cpReachedFlag;
6466}
6567
// DrainCurrentTo: registers `readTo` as the control point and then drains up to it,
// returning the result of DrainToControlPoint.
// Diff summary: the out-parameter type becomes std::optional<TCursor>, and `readTo` is
// passed to PutControlPoint directly instead of being copied into a new shared_ptr.
66- bool TMergePartialStream::DrainCurrentTo (TRecordBatchBuilder& builder, const TSortableBatchPosition& readTo, const bool includeFinish, std::optional<TSortableBatchPosition >* lastResultPosition) {
67- PutControlPoint (std::make_shared<TSortableBatchPosition>( readTo) );
68+ bool TMergePartialStream::DrainCurrentTo (TRecordBatchBuilder& builder, const TSortableBatchPosition& readTo, const bool includeFinish, std::optional<TCursor >* lastResultPosition) {
69+ PutControlPoint (readTo);
6870 return DrainToControlPoint (builder, includeFinish, lastResultPosition);
6971}
7072
71- std::shared_ptr<arrow::Table> TMergePartialStream::SingleSourceDrain (const TSortableBatchPosition& readTo, const bool includeFinish, std::optional<TSortableBatchPosition >* lastResultPosition) {
73+ std::shared_ptr<arrow::Table> TMergePartialStream::SingleSourceDrain (const TSortableBatchPosition& readTo, const bool includeFinish, std::optional<TCursor >* lastResultPosition) {
7274 std::shared_ptr<arrow::Table> result;
7375 if (SortHeap.Empty ()) {
7476 return result;
@@ -100,7 +102,7 @@ std::shared_ptr<arrow::Table> TMergePartialStream::SingleSourceDrain(const TSort
100102 result = SortHeap.Current ().GetKeyColumns ().SliceData (pos.GetPosition () + (include ? 0 : 1 ), resultSize);
101103 if (lastResultPosition && resultSize) {
102104 auto keys = SortHeap.Current ().GetKeyColumns ().SliceKeys (pos.GetPosition () + (include ? 0 : 1 ), resultSize);
103- *lastResultPosition = TSortableBatchPosition (keys, 0 , SortSchema->field_names (), {}, true );
105+ *lastResultPosition = TCursor (keys, 0 , SortSchema->field_names ());
104106 }
105107 if (SortHeap.Current ().GetFilter ()) {
106108 SortHeap.Current ().GetFilter ()->Apply (result, pos.GetPosition () + (include ? 0 : 1 ), resultSize);
@@ -109,7 +111,7 @@ std::shared_ptr<arrow::Table> TMergePartialStream::SingleSourceDrain(const TSort
109111 result = SortHeap.Current ().GetKeyColumns ().SliceData (startPos, resultSize);
110112 if (lastResultPosition && resultSize) {
111113 auto keys = SortHeap.Current ().GetKeyColumns ().SliceKeys (startPos, resultSize);
112- *lastResultPosition = TSortableBatchPosition (keys, keys->num_rows () - 1 , SortSchema->field_names (), {}, false );
114+ *lastResultPosition = TCursor (keys, keys->num_rows () - 1 , SortSchema->field_names ());
113115 }
114116 if (SortHeap.Current ().GetFilter ()) {
115117 SortHeap.Current ().GetFilter ()->Apply (result, startPos, resultSize);
@@ -144,38 +146,43 @@ std::shared_ptr<arrow::Table> TMergePartialStream::SingleSourceDrain(const TSort
// DrainAll: drains every remaining position from SortHeap into `builder`.
// Aborts unless the builder has one column builder per DataSchema field.
144146void TMergePartialStream::DrainAll (TRecordBatchBuilder& builder) {
145147 Y_ABORT_UNLESS ((ui32)DataSchema->num_fields () == builder.GetBuildersCount ());
146148 while (SortHeap.Size ()) {
// Old code: the caller ran the debug sequence check and appended the returned position.
147- if (auto currentPosition = DrainCurrentPosition ()) {
148- CheckSequenceInDebug (*currentPosition);
149- builder.AddRecord (*currentPosition);
150- }
// New code: DrainCurrentPosition appends to the builder itself; the last-position
// out-parameters are not needed here, so nullptr is passed for both.
149+ DrainCurrentPosition (&builder, nullptr , nullptr );
151150 }
152151}
153152
// DrainCurrentPosition: consumes the key at the top of SortHeap together with every
// following heap entry that has an equal key, verifying version ordering between them.
// Diff summary: the old variant returned std::optional<TSortableBatchPosition> (empty when
// the top row was deleted) and copied the key and version positions. The new variant returns
// void and instead
//   - appends the top row to `builder` (when non-null) unless the row is deleted,
//   - writes the row's scan data and position to `resultScanData` / `resultPosition`
//     (only when both pointers are non-null and the row is not deleted),
//   - runs CheckSequenceInDebug itself, for deleted rows as well.
154- std::optional<TSortableBatchPosition> TMergePartialStream::DrainCurrentPosition () {
153+ void TMergePartialStream::DrainCurrentPosition (TRecordBatchBuilder* builder, std::shared_ptr<TSortableScanData>* resultScanData, ui64* resultPosition ) {
// Preconditions: the heap is non-empty and its top is a data row, not the control point.
155154 Y_ABORT_UNLESS (SortHeap.Size ());
156155 Y_ABORT_UNLESS (!SortHeap.Current ().IsControlPoint ());
157- TSortableBatchPosition result = SortHeap.Current ().GetKeyColumns ();
158- TSortableBatchPosition resultVersion = SortHeap.Current ().GetVersionColumns ();
156+ if (!SortHeap.Current ().IsDeleted ()) {
157+ if (builder) {
158+ builder->AddRecord (SortHeap.Current ().GetKeyColumns ());
159+ }
160+ if (resultScanData && resultPosition) {
161+ *resultScanData = SortHeap.Current ().GetKeyColumns ().GetSorting ();
162+ *resultPosition = SortHeap.Current ().GetKeyColumns ().GetPosition ();
163+ }
164+ }
165+ CheckSequenceInDebug (SortHeap.Current ().GetKeyColumns ());
// Capture the starting row as (scan data, position) before the heap is advanced, since
// SortHeap.Current() refers to a different entry after Next(). The same startPosition
// is used to index both the key data and the version data.
166+ const ui64 startPosition = SortHeap.Current ().GetKeyColumns ().GetPosition ();
167+ std::shared_ptr<TSortableScanData> startSorting = SortHeap.Current ().GetKeyColumns ().GetSorting ();
168+ std::shared_ptr<TSortableScanData> startVersion = SortHeap.Current ().GetVersionColumns ().GetSorting ();
159169 bool isFirst = true ;
160- const bool deletedFlag = SortHeap.Current ().IsDeleted ();
// The first iteration always runs (isFirst); later iterations continue only while the
// heap top's key is equivalent to the starting key.
161- while (SortHeap.Size () && (isFirst || result.Compare (SortHeap.Current ().GetKeyColumns ()) == std::partial_ordering::equivalent)) {
162- auto & anotherIterator = SortHeap.Current ();
170+ while (SortHeap.Size () && (isFirst || SortHeap.Current ().GetKeyColumns ().Compare (*startSorting, startPosition) == std::partial_ordering::equivalent)) {
163171 if (!isFirst) {
172+ auto & anotherIterator = SortHeap.Current ();
// Version ordering between the starting row and each later row with the same key.
// The comparison direction is flipped in the new code (other vs. start rather than
// start vs. other), so the expected results are mirrored accordingly:
// with PossibleSameVersionFlag the other version must not be greater than the start's;
// without it the other version must be strictly less.
164173 if (PossibleSameVersionFlag) {
165- AFL_VERIFY (resultVersion.Compare (anotherIterator.GetVersionColumns ()) != std::partial_ordering::less)(" r" , resultVersion.DebugJson ())(" a" , anotherIterator.GetVersionColumns ().DebugJson ())
166- (" key" , result.DebugJson ());
174+ AFL_VERIFY (anotherIterator.GetVersionColumns ().Compare (*startVersion, startPosition) != std::partial_ordering::greater)
175+ (" r" , startVersion->BuildCursor (startPosition).DebugJson ())(" a" , anotherIterator.GetVersionColumns ().DebugJson ())
176+ (" key" , startSorting->BuildCursor (startPosition).DebugJson ());
167177 } else {
168- AFL_VERIFY (resultVersion.Compare (anotherIterator.GetVersionColumns ()) == std::partial_ordering::greater)(" r" , resultVersion.DebugJson ())(" a" , anotherIterator.GetVersionColumns ().DebugJson ())
169- (" key" , result.DebugJson ());
178+ AFL_VERIFY (anotherIterator.GetVersionColumns ().Compare (*startVersion, startPosition) == std::partial_ordering::less)
179+ (" r" , startVersion->BuildCursor (startPosition).DebugJson ())(" a" , anotherIterator.GetVersionColumns ().DebugJson ())
180+ (" key" , startSorting->BuildCursor (startPosition).DebugJson ());
170181 }
171182 }
// Advance the heap past the entry just examined.
172183 SortHeap.Next ();
173184 isFirst = false ;
174185 }
// Old code signalled a deleted row by returning an empty optional; the new code has
// already skipped the builder append and out-parameter update for deleted rows above.
175- if (deletedFlag) {
176- return {};
177- }
178- return result;
179186}
180187
181188std::vector<std::shared_ptr<arrow::RecordBatch>> TMergePartialStream::DrainAllParts (const std::map<TSortableBatchPosition, bool >& positions,
0 commit comments