Skip to content

Commit a663ef7

Browse files
authored
Merge 46d4644 into 191c2a6
2 parents 191c2a6 + 46d4644 commit a663ef7

File tree

5 files changed

+24
-7
lines changed

5 files changed

+24
-7
lines changed

ydb/core/fq/libs/config/protos/control_plane_storage.proto

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,15 @@ message TQueryMapping {
2424

2525
// 1. StatusCode(s) are handled with defined policies, non-unique StatusCode(s) across all policies is UB
2626
// 2. RetryCount and RetryPeriod are used to calculate the actual RetryRate; if it exceeds RetryCount, the query is aborted
27+
// - Due to RetryRate, the number of retries during RetryPeriod is less than 2 * RetryCount
2728
// 3. BackoffPeriod is a factor of RetryRate used to delay query execution before the next retry
28-
// 4. There are no default retry policy, all unhandled statuses are fatal
29+
// 4. RetryLimit is a hard limit on the total number of query retries; once it is reached, the query is aborted
30+
// - If RetryLimit = 0, the query can be aborted only by RetryRate
31+
// 5. There is no default retry policy; all unhandled statuses are fatal
2932

3033
message TRetryPolicy {
3134
uint64 RetryCount = 1;
35+
uint64 RetryLimit = 4;
3236
string RetryPeriod = 2;
3337
string BackoffPeriod = 3;
3438
}

ydb/core/fq/libs/control_plane_storage/config.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,10 +50,11 @@ TControlPlaneStorageConfig::TControlPlaneStorageConfig(const NConfig::TControlPl
5050
for (const auto& mapping : Proto.GetRetryPolicyMapping()) {
5151
auto& retryPolicy = mapping.GetPolicy();
5252
auto retryCount = retryPolicy.GetRetryCount();
53+
auto retryLimit = retryPolicy.GetRetryLimit();
5354
auto retryPeriod = GetDuration(retryPolicy.GetRetryPeriod(), TDuration::Hours(1));
5455
auto backoffPeriod = GetDuration(retryPolicy.GetBackoffPeriod(), TDuration::Zero());
5556
for (const auto statusCode: mapping.GetStatusCode()) {
56-
RetryPolicies.emplace(statusCode, TRetryPolicyItem(retryCount, retryPeriod, backoffPeriod));
57+
RetryPolicies.emplace(statusCode, TRetryPolicyItem(retryCount, retryLimit, retryPeriod, backoffPeriod));
5758
}
5859
}
5960

ydb/core/fq/libs/control_plane_storage/internal/task_ping.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@ TPingTaskParams ConstructHardPingTask(
173173
internal.clear_operation_id();
174174
}
175175

176-
TRetryPolicyItem policy(0, TDuration::Seconds(1), TDuration::Zero());
176+
TRetryPolicyItem policy(0, 0, TDuration::Seconds(1), TDuration::Zero());
177177
auto it = retryPolicies.find(request.status_code());
178178
auto policyFound = it != retryPolicies.end();
179179
if (policyFound) {
@@ -200,7 +200,7 @@ TPingTaskParams ConstructHardPingTask(
200200
TStringBuilder builder;
201201
builder << "Query failed with code " << NYql::NDqProto::StatusIds_StatusCode_Name(request.status_code());
202202
if (policy.RetryCount) {
203-
builder << " (failure rate " << retryLimiter.RetryRate << " exceeds limit of " << policy.RetryCount << ")";
203+
builder << " (" << retryLimiter.LastError << ")";
204204
}
205205
builder << " at " << Now();
206206

ydb/core/fq/libs/control_plane_storage/util.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,16 @@ bool TRetryLimiter::UpdateOnRetry(const TInstant& lastSeenAt, const TRetryPolicy
2828
RetryRate = 0.0;
2929
}
3030
}
31-
bool shouldRetry = RetryRate < policy.RetryCount;
31+
32+
bool shouldRetry = true;
33+
if (RetryRate >= policy.RetryCount) {
34+
shouldRetry = false;
35+
LastError = TStringBuilder() << "failure rate " << RetryRate << " exceeds limit of " << policy.RetryCount;
36+
} else if (policy.RetryLimit && RetryCount >= policy.RetryLimit) {
37+
shouldRetry = false;
38+
LastError = TStringBuilder() << "retry count reached limit of " << policy.RetryLimit;
39+
}
40+
3241
if (shouldRetry) {
3342
RetryCount++;
3443
RetryCounterUpdatedAt = now;
@@ -145,6 +154,7 @@ NConfig::TControlPlaneStorageConfig FillDefaultParameters(NConfig::TControlPlane
145154
policyMapping.AddStatusCode(NYql::NDqProto::StatusIds::EXTERNAL_ERROR);
146155
auto& policy = *policyMapping.MutablePolicy();
147156
policy.SetRetryCount(10);
157+
policy.SetRetryLimit(40);
148158
policy.SetRetryPeriod("1m");
149159
policy.SetBackoffPeriod("1s");
150160
}

ydb/core/fq/libs/control_plane_storage/util.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,11 @@ namespace NFq {
1515
class TRetryPolicyItem {
1616
public:
1717
TRetryPolicyItem() = default;
18-
TRetryPolicyItem(ui64 retryCount, const TDuration& retryPeriod, const TDuration& backoffPeriod)
19-
: RetryCount(retryCount), RetryPeriod(retryPeriod), BackoffPeriod(backoffPeriod)
18+
TRetryPolicyItem(ui64 retryCount, ui64 retryLimit, const TDuration& retryPeriod, const TDuration& backoffPeriod)
19+
: RetryCount(retryCount), RetryLimit(retryLimit), RetryPeriod(retryPeriod), BackoffPeriod(backoffPeriod)
2020
{ }
2121
ui64 RetryCount = 0;
22+
ui64 RetryLimit = 0;
2223
TDuration RetryPeriod = TDuration::Zero();
2324
TDuration BackoffPeriod = TDuration::Zero();
2425
};
@@ -32,6 +33,7 @@ class TRetryLimiter {
3233
ui64 RetryCount = 0;
3334
TInstant RetryCounterUpdatedAt = TInstant::Zero();
3435
double RetryRate = 0.0;
36+
TString LastError;
3537
};
3638

3739
bool IsTerminalStatus(FederatedQuery::QueryMeta::ComputeStatus status);

0 commit comments

Comments
 (0)