-
Notifications
You must be signed in to change notification settings - Fork 3.4k
[enhancement](cloud) improve the retry policy of cloud mode #49067
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,6 +27,7 @@ | |
#include <random> | ||
#include <type_traits> | ||
|
||
#include "common/bvars.h" | ||
#include "common/config.h" | ||
#include "cpp/sync_point.h" | ||
#include "meta-service/txn_kv.h" | ||
|
@@ -711,6 +712,30 @@ class MetaServiceProxy final : public MetaService { | |
using MetaServiceMethod = void (cloud::MetaService::*)(::google::protobuf::RpcController*, | ||
const Request*, Response*, | ||
::google::protobuf::Closure*); | ||
int64_t get_fdb_client_thread_busyness_percent() { | ||
//auto now = steady_clock::now(); | ||
auto now = std::chrono::steady_clock::now(); | ||
auto duration_s = | ||
duration_cast<std::chrono::seconds>(now - buyness_last_update_time_).count(); | ||
if (duration_s > config::bvar_qps_update_second) { | ||
cache_buyness_percent_ = g_bvar_fdb_client_thread_busyness_percent.get_value(); | ||
buyness_last_update_time_ = now; | ||
} | ||
return cache_buyness_percent_; | ||
} | ||
|
||
// Chooses the maximum retry count from the value returned by | ||
// get_fdb_client_thread_busyness_percent(): | ||
//   - above config::retry_disable_busyness_threshold: streams 1 into | ||
//     g_bvar_busynesss_disable_counter (presumably incrementing it) and returns 0; | ||
//   - above config::retry_reduce_busyness_threshold: streams 1 into | ||
//     g_bvar_busynesss_reduced_counter and returns config::busyness_reduced_retry_times; | ||
//   - otherwise: returns config::txn_store_retry_times. | ||
// The disable threshold is tested first, so it wins when both are exceeded. | ||
int get_dynamic_retry_count() { | ||
int64_t busyness_percent = get_fdb_client_thread_busyness_percent(); | ||
if (busyness_percent > config::retry_disable_busyness_threshold) { | ||
g_bvar_busynesss_disable_counter << 1; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. bvar adder should do |
||
return 0; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add bvar to record reduce and disable count There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
} else if (busyness_percent > config::retry_reduce_busyness_threshold) { | ||
g_bvar_busynesss_reduced_counter << 1; | ||
return config::busyness_reduced_retry_times; | ||
} else { | ||
return config::txn_store_retry_times; | ||
} | ||
} | ||
|
||
template <typename Request, typename Response> | ||
void call_impl(MetaServiceMethod<Request, Response> method, | ||
|
@@ -761,7 +786,8 @@ class MetaServiceProxy final : public MetaService { | |
0, config::txn_store_retry_base_intervals_ms)(rng); | ||
} | ||
|
||
if (retry_times >= config::txn_store_retry_times || | ||
int dynamic_max_retry_cnt = get_dynamic_retry_count(); | ||
if (retry_times >= dynamic_max_retry_cnt || | ||
// Retrying KV_TXN_TOO_OLD is very expensive, so we only retry once. | ||
(retry_times > 1 && code == MetaServiceCode::KV_TXN_TOO_OLD)) { | ||
// For KV_TXN_CONFLICT, we should return KV_TXN_CONFLICT_RETRY_EXCEEDED_MAX_TIMES, | ||
|
@@ -771,7 +797,10 @@ class MetaServiceProxy final : public MetaService { | |
: code == MetaServiceCode::KV_TXN_STORE_GET_RETRYABLE ? KV_TXN_GET_ERR | ||
: code == MetaServiceCode::KV_TXN_STORE_CREATE_RETRYABLE ? KV_TXN_CREATE_ERR | ||
: code == MetaServiceCode::KV_TXN_CONFLICT | ||
? KV_TXN_CONFLICT_RETRY_EXCEEDED_MAX_TIMES | ||
? get_fdb_client_thread_busyness_percent() > | ||
config::retry_disable_busyness_threshold | ||
? MetaServiceCode::KV_TXN_CONFLICT_BUSY | ||
: KV_TXN_CONFLICT_RETRY_EXCEEDED_MAX_TIMES | ||
: MetaServiceCode::KV_TXN_TOO_OLD); | ||
return; | ||
} | ||
|
@@ -784,14 +813,18 @@ class MetaServiceProxy final : public MetaService { | |
retry_times += 1; | ||
LOG(WARNING) << __PRETTY_FUNCTION__ << " sleep " << duration_ms | ||
<< " ms before next round, retry times left: " | ||
<< (config::txn_store_retry_times - retry_times) | ||
<< (dynamic_max_retry_cnt - retry_times) | ||
<< ", max retry count: " << dynamic_max_retry_cnt | ||
<< ", code: " << MetaServiceCode_Name(code) | ||
<< ", msg: " << resp->status().msg(); | ||
bthread_usleep(duration_ms * 1000); | ||
} | ||
} | ||
|
||
std::unique_ptr<MetaServiceImpl> impl_; | ||
// Time at which cache_buyness_percent_ was last refreshed. Starts 100 seconds in | ||
// the past, presumably so the first busyness query refreshes the cache at once | ||
// (holds only if config::bvar_qps_update_second is below 100 — TODO confirm). | ||
std::chrono::steady_clock::time_point buyness_last_update_time_ = | ||
std::chrono::steady_clock::now() - std::chrono::seconds(100); | ||
// Last value sampled from g_bvar_fdb_client_thread_busyness_percent; 0 until the | ||
// first refresh. NOTE(review): stored as int32_t while the getter returns int64_t. | ||
int32_t cache_buyness_percent_ = 0; | ||
}; | ||
|
||
} // namespace doris::cloud |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
add comments