@@ -28,6 +28,7 @@ namespace NYql::NDq {
28
28
using namespace NActors ;
29
29
30
30
namespace {
31
+ constexpr ui32 RequestRetriesLimit = 10 ; // TODO lookup parameters or PRAGMA?
31
32
32
33
const NKikimr::NMiniKQL::TStructType* MergeStructTypes (const NKikimr::NMiniKQL::TTypeEnvironment& env, const NKikimr::NMiniKQL::TStructType* t1, const NKikimr::NMiniKQL::TStructType* t2) {
33
34
Y_ABORT_UNLESS (t1);
@@ -45,7 +46,7 @@ namespace NYql::NDq {
45
46
template <typename T>
46
47
T ExtractFromConstFuture (const NThreading::TFuture<T>& f) {
47
48
// We want to avoid making a copy of data stored in a future.
48
- // But there is no direct way to extract data from a const future5
49
+ // But there is no direct way to extract data from a const future
49
50
// So, we make a copy of the future, that is cheap. Then, extract the value from this copy.
50
51
// It destructs the value in the original future, but this trick is legal and documented here:
51
52
// https://docs.yandex-team.ru/arcadia-cpp/cookbook/concurrency
@@ -155,20 +156,25 @@ namespace NYql::NDq {
155
156
hFunc (TEvReadSplitsPart, Handle);
156
157
hFunc (TEvReadSplitsFinished, Handle);
157
158
hFunc (TEvError, Handle);
159
+ hFunc (TEvRetry, Handle);
158
160
hFunc (NActors::TEvents::TEvPoison, Handle);)
159
161
160
162
void Handle (TEvListSplitsIterator::TPtr ev) {
161
163
auto & iterator = ev->Get ()->Iterator ;
162
164
iterator->ReadNext ().Subscribe (
163
- [actorSystem = TActivationContext::ActorSystem (), selfId = SelfId ()](const NConnector::TAsyncResult<NConnector::NApi::TListSplitsResponse>& asyncResult) {
165
+ [
166
+ actorSystem = TActivationContext::ActorSystem (),
167
+ selfId = SelfId (),
168
+ retriesRemaining = RetriesRemaining
169
+ ](const NConnector::TAsyncResult<NConnector::NApi::TListSplitsResponse>& asyncResult) {
164
170
YQL_CLOG (DEBUG, ProviderGeneric) << " ActorId=" << selfId << " Got TListSplitsResponse from Connector" ;
165
171
auto result = ExtractFromConstFuture (asyncResult);
166
172
if (result.Status .Ok ()) {
167
173
Y_ABORT_UNLESS (result.Response );
168
174
auto ev = new TEvListSplitsPart (std::move (*result.Response ));
169
175
actorSystem->Send (new NActors::IEventHandle (selfId, selfId, ev));
170
176
} else {
171
- SendError (actorSystem, selfId, result.Status );
177
+ SendRetryOrError (actorSystem, selfId, result.Status , retriesRemaining );
172
178
}
173
179
});
174
180
}
@@ -189,14 +195,18 @@ namespace NYql::NDq {
189
195
*readRequest.add_splits () = split;
190
196
readRequest.Setformat (NConnector::NApi::TReadSplitsRequest_EFormat::TReadSplitsRequest_EFormat_ARROW_IPC_STREAMING);
191
197
readRequest.set_filtering (NConnector::NApi::TReadSplitsRequest::FILTERING_MANDATORY);
192
- Connector->ReadSplits (readRequest).Subscribe ([actorSystem = TActivationContext::ActorSystem (), selfId = SelfId ()](const NConnector::TReadSplitsStreamIteratorAsyncResult& asyncResult) {
198
+ Connector->ReadSplits (readRequest).Subscribe ([
199
+ actorSystem = TActivationContext::ActorSystem (),
200
+ selfId = SelfId (),
201
+ retriesRemaining = RetriesRemaining
202
+ ](const NConnector::TReadSplitsStreamIteratorAsyncResult& asyncResult) {
193
203
YQL_CLOG (DEBUG, ProviderGeneric) << " ActorId=" << selfId << " Got ReadSplitsStreamIterator from Connector" ;
194
204
auto result = ExtractFromConstFuture (asyncResult);
195
205
if (result.Status .Ok ()) {
196
206
auto ev = new TEvReadSplitsIterator (std::move (result.Iterator ));
197
207
actorSystem->Send (new NActors::IEventHandle (selfId, selfId, ev));
198
208
} else {
199
- SendError (actorSystem, selfId, result.Status );
209
+ SendRetryOrError (actorSystem, selfId, result.Status , retriesRemaining );
200
210
}
201
211
});
202
212
}
@@ -225,6 +235,13 @@ namespace NYql::NDq {
225
235
actorSystem->Send (new NActors::IEventHandle (ParentId, SelfId (), errEv.release ()));
226
236
}
227
237
238
+ void Handle (TEvRetry::TPtr) {
239
+ auto guard = Guard (*Alloc);
240
+ Y_ENSURE (RetriesRemaining > 0 );
241
+ --RetriesRemaining;
242
+ SendRequest ();
243
+ }
244
+
228
245
void Handle (NActors::TEvents::TEvPoison::TPtr) {
229
246
PassAway ();
230
247
}
@@ -243,18 +260,22 @@ namespace NYql::NDq {
243
260
if (!request) {
244
261
return ;
245
262
}
246
- auto startCycleCount = GetCycleCountFast ();
247
263
SentTime = TInstant::Now ();
248
264
YQL_CLOG (DEBUG, ProviderGeneric) << " ActorId=" << SelfId () << " Got LookupRequest for " << request->size () << " keys" ;
249
265
Y_ABORT_IF (request->size () == 0 || request->size () > MaxKeysInRequest);
250
-
251
266
if (Count) {
252
267
Count->Inc ();
253
268
InFlight->Inc ();
254
269
Keys->Add (request->size ());
255
270
}
256
271
257
272
Request = std::move (request);
273
+ RetriesRemaining = RequestRetriesLimit;
274
+ SendRequest ();
275
+ }
276
+
277
+ void SendRequest () {
278
+ auto startCycleCount = GetCycleCountFast ();
258
279
NConnector::NApi::TListSplitsRequest splitRequest;
259
280
260
281
auto error = FillSelect (*splitRequest.add_selects ());
@@ -264,15 +285,19 @@ namespace NYql::NDq {
264
285
};
265
286
266
287
splitRequest.Setmax_split_count (1 );
267
- Connector->ListSplits (splitRequest).Subscribe ([actorSystem = TActivationContext::ActorSystem (), selfId = SelfId ()](const NConnector::TListSplitsStreamIteratorAsyncResult& asyncResult) {
288
+ Connector->ListSplits (splitRequest).Subscribe ([
289
+ actorSystem = TActivationContext::ActorSystem (),
290
+ selfId = SelfId (),
291
+ retriesRemaining = RetriesRemaining
292
+ ](const NConnector::TListSplitsStreamIteratorAsyncResult& asyncResult) {
268
293
auto result = ExtractFromConstFuture (asyncResult);
269
294
if (result.Status .Ok ()) {
270
295
YQL_CLOG (DEBUG, ProviderGeneric) << " ActorId=" << selfId << " Got TListSplitsStreamIterator" ;
271
296
Y_ABORT_UNLESS (result.Iterator , " Uninitialized iterator" );
272
297
auto ev = new TEvListSplitsIterator (std::move (result.Iterator ));
273
298
actorSystem->Send (new NActors::IEventHandle (selfId, selfId, ev));
274
299
} else {
275
- SendError (actorSystem, selfId, result.Status );
300
+ SendRetryOrError (actorSystem, selfId, result.Status , retriesRemaining );
276
301
}
277
302
});
278
303
if (CpuTime) {
@@ -282,12 +307,17 @@ namespace NYql::NDq {
282
307
283
308
void ReadNextData () {
284
309
ReadSplitsIterator->ReadNext ().Subscribe (
285
- [actorSystem = TActivationContext::ActorSystem (), selfId = SelfId ()](const NConnector::TAsyncResult<NConnector::NApi::TReadSplitsResponse>& asyncResult) {
310
+ [
311
+ actorSystem = TActivationContext::ActorSystem (),
312
+ selfId = SelfId (),
313
+ retriesRemaining = RetriesRemaining
314
+ ](const NConnector::TAsyncResult<NConnector::NApi::TReadSplitsResponse>& asyncResult) {
286
315
auto result = ExtractFromConstFuture (asyncResult);
287
316
if (result.Status .Ok ()) {
288
317
YQL_CLOG (DEBUG, ProviderGeneric) << " ActorId=" << selfId << " Got DataChunk" ;
289
318
Y_ABORT_UNLESS (result.Response );
290
319
auto & response = *result.Response ;
320
+ // TODO: retry on some YDB errors
291
321
if (NConnector::IsSuccess (response)) {
292
322
auto ev = new TEvReadSplitsPart (std::move (response));
293
323
actorSystem->Send (new NActors::IEventHandle (selfId, selfId, ev));
@@ -299,7 +329,7 @@ namespace NYql::NDq {
299
329
auto ev = new TEvReadSplitsFinished (std::move (result.Status ));
300
330
actorSystem->Send (new NActors::IEventHandle (selfId, selfId, ev));
301
331
} else {
302
- SendError (actorSystem, selfId, result.Status );
332
+ SendRetryOrError (actorSystem, selfId, result.Status , retriesRemaining );
303
333
}
304
334
});
305
335
}
@@ -365,7 +395,18 @@ namespace NYql::NDq {
365
395
new TEvError (std::move (error)));
366
396
}
367
397
368
- static void SendError (NActors::TActorSystem* actorSystem, const NActors::TActorId& selfId, const NYdbGrpc::TGrpcStatus& status) {
398
+ static void SendRetryOrError (NActors::TActorSystem* actorSystem, const NActors::TActorId& selfId, const NYdbGrpc::TGrpcStatus& status, const ui32 retriesRemaining) {
399
+ if (NConnector::GrpcStatusNeedsRetry (status)) {
400
+ if (retriesRemaining) {
401
+ const auto retry = RequestRetriesLimit - retriesRemaining;
402
+ // XXX FIXME tune/tweak
403
+ const auto delay = TDuration::MilliSeconds (1u << retry); // Exponential delay from 1ms to 1s
404
+ YQL_CLOG (WARN, ProviderGeneric) << " ActorId=" << selfId << " Got retrievable GRPC Error from Connector: " << status.ToDebugString () << " , retry " << (retry + 1 ) << " of " << RequestRetriesLimit << " , scheduled in " << delay;
405
+ actorSystem->Schedule (delay, new IEventHandle (selfId, selfId, new TEvRetry ()));
406
+ return ;
407
+ }
408
+ YQL_CLOG (ERROR, ProviderGeneric) << " ActorId=" << selfId << " Got retrievable GRPC Error from Connector: " << status.ToDebugString () << " , retry count exceed limit " << RequestRetriesLimit;
409
+ }
369
410
SendError (actorSystem, selfId, NConnector::ErrorFromGRPCStatus (status));
370
411
}
371
412
@@ -422,6 +463,8 @@ namespace NYql::NDq {
422
463
423
464
NConnector::NApi::TPredicate_TDisjunction disjunction;
424
465
for (const auto & [k, _] : *Request) {
466
+ // TODO consider skipping already retrieved keys
467
+ // ... but careful, can we end up with zero? TODO
425
468
NConnector::NApi::TPredicate_TConjunction conjunction;
426
469
for (ui32 c = 0 ; c != KeyType->GetMembersCount (); ++c) {
427
470
NConnector::NApi::TPredicate_TComparison eq;
@@ -454,6 +497,7 @@ namespace NYql::NDq {
454
497
std::shared_ptr<IDqAsyncLookupSource::TUnboxedValueMap> Request;
455
498
NConnector::IReadSplitsStreamIterator::TPtr ReadSplitsIterator; // TODO move me to TEvReadSplitsPart
456
499
NKikimr::NMiniKQL::TKeyPayloadPairVector LookupResult;
500
+ ui32 RetriesRemaining;
457
501
::NMonitoring::TDynamicCounters::TCounterPtr Count;
458
502
::NMonitoring::TDynamicCounters::TCounterPtr Keys;
459
503
::NMonitoring::TDynamicCounters::TCounterPtr ResultRows;
0 commit comments