@@ -36,8 +36,6 @@ class TSampleKScan final: public TActor<TSampleKScan>, public NTable::IScan {
3636 const TSerializedTableRange RequestedRange;
3737 const ui64 K;
3838
39- IDriver* Driver = nullptr ;
40-
4139 struct TProbability {
4240 ui64 P = 0 ;
4341 ui64 I = 0 ;
@@ -46,11 +44,18 @@ class TSampleKScan final: public TActor<TSampleKScan>, public NTable::IScan {
4644 auto operator <=>(const TProbability&) const noexcept = default ;
4745 };
4846
// Diff view of the sampling state: the Rng member is moved below MaxProbability,
// and Driver is re-added after the containers.
49- TReallyFastRng32 Rng;
// Scan statistics: both are updated in Feed() and reported by FillResponse()
// through SetRowsDelta() / SetBytesDelta().
47+ ui64 RowsCount = 0 ;
48+ ui64 RowsBytes = 0 ;
49+
50+ // We use a binary heap because we do not want to do batch processing here:
51+ // serialization is more expensive than comparison.
5052 ui64 MaxProbability = 0 ;
53+ TReallyFastRng32 Rng;
// MaxRows holds (P, I) entries, where I indexes into DataRows; DataRows holds the serialized rows.
5154 std::vector<TProbability> MaxRows;
5255 std::vector<TString> DataRows;
5356
57+ IDriver* Driver = nullptr ;
58+
5459public:
5560 static constexpr NKikimrServices::TActivity::EType ActorActivityType () {
5661 return NKikimrServices::TActivity::SAMPLE_K_SCAN_ACTOR;
@@ -61,6 +66,7 @@ class TSampleKScan final: public TActor<TSampleKScan>, public NTable::IScan {
6166 const TSerializedTableRange& range,
6267 ui64 k,
6368 ui64 seed,
69+ ui64 maxProbability,
6470 TProtoColumnsCRef columns,
6571 const TUserTable& tableInfo)
6672 : TActor(&TThis::StateWork)
@@ -71,8 +77,10 @@ class TSampleKScan final: public TActor<TSampleKScan>, public NTable::IScan {
7177 , TableRange(tableInfo.Range)
7278 , RequestedRange(range)
7379 , K(k)
80+ , MaxProbability(maxProbability)
7481 , Rng(seed)
7582 {
83+ Y_ASSERT (MaxProbability != 0 );
7684 }
7785
7886 ~TSampleKScan () final = default ;
@@ -109,19 +117,32 @@ class TSampleKScan final: public TActor<TSampleKScan>, public NTable::IScan {
109117
// Diff view of Feed(): lines carrying a plus marker are added, lines carrying a minus marker are removed.
// Per-row scan callback. Counts the row, draws a random value with GetProbability(),
// and keeps the serialized row only when the draw does not exceed MaxProbability.
// Returns EScan::Feed to continue, or EScan::Final once MaxProbability is 0.
110118 EScan Feed (TArrayRef<const TCell> key, const TRow& row) noexcept final {
111119 LOG_T (" Feed key " << DebugPrintPoint (KeyTypes, key, *AppData ()->TypeRegistry ) << " " << Debug ());
120+ ++RowsCount;
121+
122+ const auto probability = GetProbability ();
// Rejected row: it is not serialized, only its serialized size is added to RowsBytes.
123+ if (probability > MaxProbability) {
124+ // TODO(mbkkt) it's not nice that we need to compute this, probably can be precomputed in TRow
125+ RowsBytes += TSerializedCellVec::SerializedSize (*row);
126+ return EScan::Feed;
127+ }
128+
// Accepted row: serialize once, then move the buffer into DataRows or ReplaceRow().
129+ auto serialized = TSerializedCellVec::Serialize (*row);
130+ RowsBytes += serialized.size ();
112131
113- const auto probability = Rng.GenRand64 ();
// Fewer than K rows stored so far: append the row and remember its index in DataRows.
114132 if (DataRows.size () < K) {
115133 MaxRows.push_back ({probability, DataRows.size ()});
116- DataRows.emplace_back (TSerializedCellVec::Serialize (*row ));
134+ DataRows.emplace_back (std::move (serialized ));
// Exactly K rows stored: build the heap and take its front element as the new threshold.
117135 if (DataRows.size () == K) {
118136 std::make_heap (MaxRows.begin (), MaxRows.end ());
119137 MaxProbability = MaxRows.front ().P ;
120138 }
// NOTE(review): the removed branch replaced only when probability < MaxProbability;
// the added branch also replaces when the draw equals MaxProbability.
// NOTE(review): with K == 0 this branch runs while MaxRows is empty, and ReplaceRow() calls
// MaxRows.back() - confirm that callers reject K == 0.
121- } else if (probability < MaxProbability) {
122- ReplaceRow (row , probability);
139+ } else {
140+ ReplaceRow (std::move (serialized) , probability);
123141 }
124142
// A threshold of 0 ends the scan early.
143+ if (MaxProbability == 0 ) {
144+ return EScan::Final;
145+ }
125146 return EScan::Feed;
126147 }
127148
@@ -167,21 +188,35 @@ class TSampleKScan final: public TActor<TSampleKScan>, public NTable::IScan {
167188 }
168189 }
169190
// Diff view of ReplaceRow(): the signature changes from taking a TRow to taking an already serialized TString.
// Replaces the stored row at the top of the MaxRows heap with the given row and value p,
// then refreshes MaxProbability from the new heap front.
// Uses MaxRows.back() and MaxRows.front(), so MaxRows must not be empty when this is called.
170- void ReplaceRow (const TRow& row, ui64 p) {
191+ void ReplaceRow (TString&& row, ui64 p) {
192+ // TODO(mbkkt) use a tournament tree to do fewer compares and swaps
// pop_heap moves the current front element to the back of MaxRows.
171193 std::pop_heap (MaxRows.begin (), MaxRows.end ());
// Reuse the DataRows slot of the popped entry for the new row.
172- DataRows[MaxRows.back ().I ] = TSerializedCellVec::Serialize (* row);
194+ DataRows[MaxRows.back ().I ] = std::move ( row);
173195 MaxRows.back ().P = p;
// push_heap restores the heap property after the back element was updated.
174196 std::push_heap (MaxRows.begin (), MaxRows.end ());
175197 MaxProbability = MaxRows.front ().P ;
176198 }
177199
// Diff view of FillResponse() and the newly added GetProbability(); the two share closing lines in this hunk.
// FillResponse: sorts MaxRows, then for each entry adds its value and its row to Response->Record.
// Rows are moved out of DataRows, so DataRows entries are left moved-from afterwards.
178200 void FillResponse () {
179201 std::sort (MaxRows.begin (), MaxRows.end ());
202+ auto & record = Response->Record ;
180203 for (auto & [p, i] : MaxRows) {
181- Response->Record .AddProbabilities (p);
182- Response->Record .AddRows (std::move (DataRows[i]));
204+ record.AddProbabilities (p);
205+ record.AddRows (std::move (DataRows[i]));
206+ }
// Report the counters accumulated in Feed() and mark the response as DONE.
207+ record.SetRowsDelta (RowsCount);
208+ record.SetBytesDelta (RowsBytes);
209+ record.SetStatus (NKikimrIndexBuilder::EBuildStatus::DONE);
210+ }
211+
// GetProbability: draws 64-bit values from Rng until one differs from the ui64 maximum, and returns it.
212+ ui64 GetProbability () {
213+ while (true ) {
214+ auto p = Rng.GenRand64 ();
215+ // The max ui64 value is never returned, so it can serve as the initial maximum
216+ if (Y_LIKELY (p != std::numeric_limits<ui64>::max ())) {
217+ return p;
218+ }
183219 }
184- Response->Record .SetStatus (NKikimrIndexBuilder::EBuildStatus::DONE);
185220 }
186221};
187222
@@ -317,6 +352,7 @@ void TDataShard::HandleSafe(TEvDataShard::TEvSampleKRequest::TPtr& ev, const TAc
317352 requestedRange,
318353 record.GetK (),
319354 record.GetSeed (),
355+ record.GetMaxProbability (),
320356 record.GetColumns (),
321357 userTable),
322358 ev->Cookie ,
0 commit comments