@@ -28,6 +28,7 @@ namespace NYql::NDq {
28
28
using namespace NActors ;
29
29
30
30
namespace {
31
+ constexpr ui32 RequestRetriesLimit = 10 ; // TODO lookup parameters or PRAGMA?
31
32
32
33
const NKikimr::NMiniKQL::TStructType* MergeStructTypes (const NKikimr::NMiniKQL::TTypeEnvironment& env, const NKikimr::NMiniKQL::TStructType* t1, const NKikimr::NMiniKQL::TStructType* t2) {
33
34
Y_ABORT_UNLESS (t1);
@@ -45,7 +46,7 @@ namespace NYql::NDq {
45
46
template <typename T>
46
47
T ExtractFromConstFuture (const NThreading::TFuture<T>& f) {
47
48
// We want to avoid making a copy of data stored in a future.
48
- // But there is no direct way to extract data from a const future5
49
+ // But there is no direct way to extract data from a const future
49
50
// So, we make a copy of the future, that is cheap. Then, extract the value from this copy.
50
51
// It destructs the value in the original future, but this trick is legal and documented here:
51
52
// https://docs.yandex-team.ru/arcadia-cpp/cookbook/concurrency
@@ -150,20 +151,25 @@ namespace NYql::NDq {
150
151
hFunc (TEvReadSplitsPart, Handle);
151
152
hFunc (TEvReadSplitsFinished, Handle);
152
153
hFunc (TEvError, Handle);
154
+ hFunc (TEvRetry, Handle);
153
155
hFunc (NActors::TEvents::TEvPoison, Handle);)
154
156
155
157
void Handle (TEvListSplitsIterator::TPtr ev) {
156
158
auto & iterator = ev->Get ()->Iterator ;
157
159
iterator->ReadNext ().Subscribe (
158
- [actorSystem = TActivationContext::ActorSystem (), selfId = SelfId ()](const NConnector::TAsyncResult<NConnector::NApi::TListSplitsResponse>& asyncResult) {
160
+ [
161
+ actorSystem = TActivationContext::ActorSystem (),
162
+ selfId = SelfId (),
163
+ retriesRemaining = RetriesRemaining
164
+ ](const NConnector::TAsyncResult<NConnector::NApi::TListSplitsResponse>& asyncResult) {
159
165
YQL_CLOG (DEBUG, ProviderGeneric) << " ActorId=" << selfId << " Got TListSplitsResponse from Connector" ;
160
166
auto result = ExtractFromConstFuture (asyncResult);
161
167
if (result.Status .Ok ()) {
162
168
Y_ABORT_UNLESS (result.Response );
163
169
auto ev = new TEvListSplitsPart (std::move (*result.Response ));
164
170
actorSystem->Send (new NActors::IEventHandle (selfId, selfId, ev));
165
171
} else {
166
- SendError (actorSystem, selfId, result.Status );
172
+ SendRetryOrError (actorSystem, selfId, result.Status , retriesRemaining );
167
173
}
168
174
});
169
175
}
@@ -184,14 +190,18 @@ namespace NYql::NDq {
184
190
*readRequest.add_splits () = split;
185
191
readRequest.Setformat (NConnector::NApi::TReadSplitsRequest_EFormat::TReadSplitsRequest_EFormat_ARROW_IPC_STREAMING);
186
192
readRequest.set_filtering (NConnector::NApi::TReadSplitsRequest::FILTERING_MANDATORY);
187
- Connector->ReadSplits (readRequest).Subscribe ([actorSystem = TActivationContext::ActorSystem (), selfId = SelfId ()](const NConnector::TReadSplitsStreamIteratorAsyncResult& asyncResult) {
193
+ Connector->ReadSplits (readRequest).Subscribe ([
194
+ actorSystem = TActivationContext::ActorSystem (),
195
+ selfId = SelfId (),
196
+ retriesRemaining = RetriesRemaining
197
+ ](const NConnector::TReadSplitsStreamIteratorAsyncResult& asyncResult) {
188
198
YQL_CLOG (DEBUG, ProviderGeneric) << " ActorId=" << selfId << " Got ReadSplitsStreamIterator from Connector" ;
189
199
auto result = ExtractFromConstFuture (asyncResult);
190
200
if (result.Status .Ok ()) {
191
201
auto ev = new TEvReadSplitsIterator (std::move (result.Iterator ));
192
202
actorSystem->Send (new NActors::IEventHandle (selfId, selfId, ev));
193
203
} else {
194
- SendError (actorSystem, selfId, result.Status );
204
+ SendRetryOrError (actorSystem, selfId, result.Status , retriesRemaining );
195
205
}
196
206
});
197
207
}
@@ -220,6 +230,13 @@ namespace NYql::NDq {
220
230
actorSystem->Send (new NActors::IEventHandle (ParentId, SelfId (), errEv.release ()));
221
231
}
222
232
233
+ void Handle (TEvRetry::TPtr ev) {
234
+ auto guard = Guard (*Alloc);
235
+ Y_ENSURE (RetriesRemaining > 0 );
236
+ --RetriesRemaining;
237
+ SendRequest ();
238
+ }
239
+
223
240
void Handle (NActors::TEvents::TEvPoison::TPtr) {
224
241
PassAway ();
225
242
}
@@ -238,17 +255,21 @@ namespace NYql::NDq {
238
255
if (!request) {
239
256
return ;
240
257
}
241
- auto startCycleCount = GetCycleCountFast ();
242
258
SentTime = TInstant::Now ();
243
259
YQL_CLOG (DEBUG, ProviderGeneric) << " ActorId=" << SelfId () << " Got LookupRequest for " << request->size () << " keys" ;
244
260
Y_ABORT_IF (request->size () == 0 || request->size () > MaxKeysInRequest);
245
-
246
261
if (Count) {
247
262
Count->Inc ();
248
263
Keys->Add (request->size ());
249
264
}
250
265
251
266
Request = std::move (request);
267
+ RetriesRemaining = RequestRetriesLimit;
268
+ SendRequest ();
269
+ }
270
+
271
+ void SendRequest () {
272
+ auto startCycleCount = GetCycleCountFast ();
252
273
NConnector::NApi::TListSplitsRequest splitRequest;
253
274
254
275
auto error = FillSelect (*splitRequest.add_selects ());
@@ -258,15 +279,19 @@ namespace NYql::NDq {
258
279
};
259
280
260
281
splitRequest.Setmax_split_count (1 );
261
- Connector->ListSplits (splitRequest).Subscribe ([actorSystem = TActivationContext::ActorSystem (), selfId = SelfId ()](const NConnector::TListSplitsStreamIteratorAsyncResult& asyncResult) {
282
+ Connector->ListSplits (splitRequest).Subscribe ([
283
+ actorSystem = TActivationContext::ActorSystem (),
284
+ selfId = SelfId (),
285
+ retriesRemaining = RetriesRemaining
286
+ ](const NConnector::TListSplitsStreamIteratorAsyncResult& asyncResult) {
262
287
auto result = ExtractFromConstFuture (asyncResult);
263
288
if (result.Status .Ok ()) {
264
289
YQL_CLOG (DEBUG, ProviderGeneric) << " ActorId=" << selfId << " Got TListSplitsStreamIterator" ;
265
290
Y_ABORT_UNLESS (result.Iterator , " Uninitialized iterator" );
266
291
auto ev = new TEvListSplitsIterator (std::move (result.Iterator ));
267
292
actorSystem->Send (new NActors::IEventHandle (selfId, selfId, ev));
268
293
} else {
269
- SendError (actorSystem, selfId, result.Status );
294
+ SendRetryOrError (actorSystem, selfId, result.Status , retriesRemaining );
270
295
}
271
296
});
272
297
if (CpuTime) {
@@ -276,12 +301,17 @@ namespace NYql::NDq {
276
301
277
302
void ReadNextData () {
278
303
ReadSplitsIterator->ReadNext ().Subscribe (
279
- [actorSystem = TActivationContext::ActorSystem (), selfId = SelfId ()](const NConnector::TAsyncResult<NConnector::NApi::TReadSplitsResponse>& asyncResult) {
304
+ [
305
+ actorSystem = TActivationContext::ActorSystem (),
306
+ selfId = SelfId (),
307
+ retriesRemaining = RetriesRemaining
308
+ ](const NConnector::TAsyncResult<NConnector::NApi::TReadSplitsResponse>& asyncResult) {
280
309
auto result = ExtractFromConstFuture (asyncResult);
281
310
if (result.Status .Ok ()) {
282
311
YQL_CLOG (DEBUG, ProviderGeneric) << " ActorId=" << selfId << " Got DataChunk" ;
283
312
Y_ABORT_UNLESS (result.Response );
284
313
auto & response = *result.Response ;
314
+ // TODO: retry on some YDB errors
285
315
if (NConnector::IsSuccess (response)) {
286
316
auto ev = new TEvReadSplitsPart (std::move (response));
287
317
actorSystem->Send (new NActors::IEventHandle (selfId, selfId, ev));
@@ -293,7 +323,7 @@ namespace NYql::NDq {
293
323
auto ev = new TEvReadSplitsFinished (std::move (result.Status ));
294
324
actorSystem->Send (new NActors::IEventHandle (selfId, selfId, ev));
295
325
} else {
296
- SendError (actorSystem, selfId, result.Status );
326
+ SendRetryOrError (actorSystem, selfId, result.Status , retriesRemaining );
297
327
}
298
328
});
299
329
}
@@ -358,7 +388,18 @@ namespace NYql::NDq {
358
388
new TEvError (std::move (error)));
359
389
}
360
390
361
- static void SendError (NActors::TActorSystem* actorSystem, const NActors::TActorId& selfId, const NYdbGrpc::TGrpcStatus& status) {
391
+ static void SendRetryOrError (NActors::TActorSystem* actorSystem, const NActors::TActorId& selfId, const NYdbGrpc::TGrpcStatus& status, const ui32 retriesRemaining) {
392
+ if (NConnector::GrpcStatusNeedsRetry (status)) {
393
+ if (retriesRemaining) {
394
+ const auto retry = RequestRetriesLimit - retriesRemaining;
395
+ // XXX FIXME tune/tweak
396
+ const auto delay = TDuration::MilliSeconds (1u << retry); // Exponential delay from 1ms to 1s
397
+ YQL_CLOG (WARN, ProviderGeneric) << " ActorId=" << selfId << " Got retrievable GRPC Error from Connector: " << status.ToDebugString () << " , retry " << (retry + 1 ) << " of " << RequestRetriesLimit << " , scheduled in " << delay;
398
+ actorSystem->Schedule (delay, new IEventHandle (selfId, selfId, new TEvRetry ()));
399
+ return ;
400
+ }
401
+ YQL_CLOG (ERROR, ProviderGeneric) << " ActorId=" << selfId << " Got retrievable GRPC Error from Connector: " << status.ToDebugString () << " , retry count exceed limit " << RequestRetriesLimit;
402
+ }
362
403
SendError (actorSystem, selfId, NConnector::ErrorFromGRPCStatus (status));
363
404
}
364
405
@@ -415,6 +456,8 @@ namespace NYql::NDq {
415
456
416
457
NConnector::NApi::TPredicate_TDisjunction disjunction;
417
458
for (const auto & [k, _] : *Request) {
459
+ // TODO consider skipping already retrieved keys
460
+ // ... but careful, can we end up with zero? TODO
418
461
NConnector::NApi::TPredicate_TConjunction conjunction;
419
462
for (ui32 c = 0 ; c != KeyType->GetMembersCount (); ++c) {
420
463
NConnector::NApi::TPredicate_TComparison eq;
@@ -447,6 +490,7 @@ namespace NYql::NDq {
447
490
std::shared_ptr<IDqAsyncLookupSource::TUnboxedValueMap> Request;
448
491
NConnector::IReadSplitsStreamIterator::TPtr ReadSplitsIterator; // TODO move me to TEvReadSplitsPart
449
492
NKikimr::NMiniKQL::TKeyPayloadPairVector LookupResult;
493
+ ui32 RetriesRemaining;
450
494
::NMonitoring::TDynamicCounters::TCounterPtr Count;
451
495
::NMonitoring::TDynamicCounters::TCounterPtr Keys;
452
496
::NMonitoring::TDynamicCounters::TCounterPtr ResultRows;
0 commit comments