Skip to content

Commit da3251f

Browse files
move time difference issue under database level (#5859)
1 parent 81d4eb3 commit da3251f

File tree

2 files changed

+50
-36
lines changed

2 files changed

+50
-36
lines changed

ydb/core/health_check/health_check.cpp

Lines changed: 42 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
235235
ui64 StorageQuota;
236236
ui64 StorageUsage;
237237
TMaybeServerlessComputeResourcesMode ServerlessComputeResourcesMode;
238+
TNodeId MaxTimeDifferenceNodeId = 0;
238239
};
239240

240241
struct TSelfCheckResult {
@@ -1265,7 +1266,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
12651266
}
12661267
}
12671268

1268-
void FillComputeNodeStatus(TDatabaseState& databaseState,TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) {
1269+
void FillComputeNodeStatus(TDatabaseState& databaseState, TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) {
12691270
FillNodeInfo(nodeId, context.Location.mutable_compute()->mutable_node());
12701271

12711272
TSelfCheckContext rrContext(&context, "NODE_UPTIME");
@@ -1303,6 +1304,34 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
13031304
}
13041305
loadAverageStatus.set_overall(laContext.GetOverallStatus());
13051306
}
1307+
1308+
if (nodeSystemState.HasMaxClockSkewPeerId()) {
1309+
TNodeId peerId = nodeSystemState.GetMaxClockSkewPeerId();
1310+
long timeDifferenceUs = nodeSystemState.GetMaxClockSkewWithPeerUs();
1311+
TDuration timeDifferenceDuration = TDuration::MicroSeconds(abs(timeDifferenceUs));
1312+
Ydb::Monitoring::StatusFlag::Status status;
1313+
if (timeDifferenceDuration > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) {
1314+
status = Ydb::Monitoring::StatusFlag::ORANGE;
1315+
} else if (timeDifferenceDuration > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) {
1316+
status = Ydb::Monitoring::StatusFlag::YELLOW;
1317+
} else {
1318+
status = Ydb::Monitoring::StatusFlag::GREEN;
1319+
}
1320+
1321+
computeNodeStatus.mutable_max_time_difference()->set_peer(ToString(peerId));
1322+
computeNodeStatus.mutable_max_time_difference()->set_difference_ms(timeDifferenceDuration.MilliSeconds());
1323+
computeNodeStatus.set_overall(status);
1324+
1325+
if (databaseState.MaxTimeDifferenceNodeId == nodeId) {
1326+
TSelfCheckContext tdContext(&context, "NODES_TIME_DIFFERENCE");
1327+
FillNodeInfo(peerId, tdContext.Location.mutable_compute()->mutable_peer());
1328+
if (status == Ydb::Monitoring::StatusFlag::GREEN) {
1329+
tdContext.ReportStatus(status);
1330+
} else {
1331+
tdContext.ReportStatus(status, TStringBuilder() << "The nodes have a time difference of " << timeDifferenceDuration.MilliSeconds() << " ms", ETags::SyncState);
1332+
}
1333+
}
1334+
}
13061335
} else {
13071336
// context.ReportStatus(Ydb::Monitoring::StatusFlag::RED,
13081337
// TStringBuilder() << "Compute node is not available",
@@ -1334,12 +1363,24 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
13341363
if (systemStatus != Ydb::Monitoring::StatusFlag::GREEN && systemStatus != Ydb::Monitoring::StatusFlag::GREY) {
13351364
context.ReportStatus(systemStatus, "Compute has issues with system tablets", ETags::ComputeState, {ETags::SystemTabletState});
13361365
}
1366+
long maxTimeDifferenceUs = 0;
1367+
for (TNodeId nodeId : *computeNodeIds) {
1368+
auto itNodeSystemState = MergedNodeSystemState.find(nodeId);
1369+
if (itNodeSystemState != MergedNodeSystemState.end()) {
1370+
if (std::count(computeNodeIds->begin(), computeNodeIds->end(), itNodeSystemState->second->GetMaxClockSkewPeerId()) > 0
1371+
&& abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs()) > maxTimeDifferenceUs) {
1372+
maxTimeDifferenceUs = abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs());
1373+
databaseState.MaxTimeDifferenceNodeId = nodeId;
1374+
}
1375+
}
1376+
}
13371377
for (TNodeId nodeId : *computeNodeIds) {
13381378
auto& computeNode = *computeStatus.add_nodes();
13391379
FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"});
13401380
}
13411381
context.ReportWithMaxChildStatus("Some nodes are restarting too often", ETags::ComputeState, {ETags::Uptime});
13421382
context.ReportWithMaxChildStatus("Compute is overloaded", ETags::ComputeState, {ETags::OverloadState});
1383+
context.ReportWithMaxChildStatus("Database has time difference between nodes", ETags::ComputeState, {ETags::SyncState});
13431384
Ydb::Monitoring::StatusFlag::Status tabletsStatus = Ydb::Monitoring::StatusFlag::GREEN;
13441385
computeNodeIds->push_back(0); // for tablets without node
13451386
for (TNodeId nodeId : *computeNodeIds) {
@@ -2086,40 +2127,6 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
20862127
const TDuration MAX_CLOCKSKEW_ORANGE_ISSUE_TIME = TDuration::MicroSeconds(25000);
20872128
const TDuration MAX_CLOCKSKEW_YELLOW_ISSUE_TIME = TDuration::MicroSeconds(5000);
20882129

2089-
void FillNodesSyncStatus(TOverallStateContext& context) {
2090-
long maxClockSkewUs = 0;
2091-
TNodeId maxClockSkewPeerId = 0;
2092-
TNodeId maxClockSkewNodeId = 0;
2093-
for (auto& [nodeId, nodeSystemState] : MergedNodeSystemState) {
2094-
if (IsTimeDifferenceCheckNode(nodeId) && IsTimeDifferenceCheckNode(nodeSystemState->GetMaxClockSkewPeerId())
2095-
&& abs(nodeSystemState->GetMaxClockSkewWithPeerUs()) > maxClockSkewUs) {
2096-
maxClockSkewUs = abs(nodeSystemState->GetMaxClockSkewWithPeerUs());
2097-
maxClockSkewPeerId = nodeSystemState->GetMaxClockSkewPeerId();
2098-
maxClockSkewNodeId = nodeId;
2099-
}
2100-
}
2101-
if (!maxClockSkewNodeId) {
2102-
return;
2103-
}
2104-
2105-
TSelfCheckResult syncContext;
2106-
syncContext.Type = "NODES_TIME_DIFFERENCE";
2107-
FillNodeInfo(maxClockSkewNodeId, syncContext.Location.mutable_node());
2108-
FillNodeInfo(maxClockSkewPeerId, syncContext.Location.mutable_peer());
2109-
2110-
TDuration maxClockSkewTime = TDuration::MicroSeconds(maxClockSkewUs);
2111-
if (maxClockSkewTime > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) {
2112-
syncContext.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, TStringBuilder() << "The nodes have a time difference of " << maxClockSkewTime.MilliSeconds() << " ms", ETags::SyncState);
2113-
} else if (maxClockSkewTime > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) {
2114-
syncContext.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, TStringBuilder() << "The nodes have a time difference of " << maxClockSkewTime.MilliSeconds() << " ms", ETags::SyncState);
2115-
} else {
2116-
syncContext.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
2117-
}
2118-
2119-
context.UpdateMaxStatus(syncContext.GetOverallStatus());
2120-
context.AddIssues(syncContext.IssueRecords);
2121-
}
2122-
21232130
void FillResult(TOverallStateContext context) {
21242131
if (IsSpecificDatabaseFilter()) {
21252132
FillDatabaseResult(context, FilterDatabase, DatabaseState[FilterDatabase]);
@@ -2128,7 +2135,6 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
21282135
FillDatabaseResult(context, path, state);
21292136
}
21302137
}
2131-
FillNodesSyncStatus(context);
21322138
if (DatabaseState.empty()) {
21332139
Ydb::Monitoring::DatabaseStatus& databaseStatus(*context.Result->add_database_status());
21342140
TSelfCheckResult tabletContext;

ydb/public/api/protos/ydb_monitoring.proto

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,12 +106,19 @@ message LoadAverageStatus {
106106
uint32 cores = 3;
107107
}
108108

109+
message TimeDifferenceStatus {
110+
StatusFlag.Status overall = 1;
111+
int64 difference_ms = 2;
112+
string peer = 3;
113+
}
114+
109115
message ComputeNodeStatus {
110116
string id = 1;
111117
StatusFlag.Status overall = 2;
112118
repeated ComputeTabletStatus tablets = 3;
113119
repeated ThreadPoolStatus pools = 4;
114120
LoadAverageStatus load = 5;
121+
TimeDifferenceStatus max_time_difference = 6;
115122
}
116123

117124
message ComputeStatus {
@@ -165,6 +172,7 @@ message LocationCompute {
165172
LocationNode node = 1;
166173
LocationComputePool pool = 2;
167174
LocationComputeTablet tablet = 3;
175+
LocationNode peer = 4;
168176
}
169177

170178
message LocationDatabase {

0 commit comments

Comments
 (0)