@@ -1265,7 +1265,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
1265
1265
}
1266
1266
}
1267
1267
1268
- void FillComputeNodeStatus (TDatabaseState& databaseState,TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) {
1268
+ void FillComputeNodeStatus (TDatabaseState& databaseState, TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context, bool reportTimeDifference ) {
1269
1269
FillNodeInfo (nodeId, context.Location .mutable_compute ()->mutable_node ());
1270
1270
1271
1271
TSelfCheckContext rrContext (&context, " NODE_UPTIME" );
@@ -1303,6 +1303,34 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
1303
1303
}
1304
1304
loadAverageStatus.set_overall (laContext.GetOverallStatus ());
1305
1305
}
1306
+
1307
+ if (nodeSystemState.HasMaxClockSkewPeerId ()) {
1308
+ TNodeId peerId = nodeSystemState.GetMaxClockSkewPeerId ();
1309
+ long timeDifferenceUs = nodeSystemState.GetMaxClockSkewWithPeerUs ();
1310
+ TDuration timeDifferenceDuration = TDuration::MicroSeconds (abs (timeDifferenceUs));
1311
+ Ydb::Monitoring::StatusFlag::Status status;
1312
+ if (timeDifferenceDuration > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) {
1313
+ status = Ydb::Monitoring::StatusFlag::ORANGE;
1314
+ } else if (timeDifferenceDuration > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) {
1315
+ status = Ydb::Monitoring::StatusFlag::YELLOW;
1316
+ } else {
1317
+ status = Ydb::Monitoring::StatusFlag::GREEN;
1318
+ }
1319
+
1320
+ computeNodeStatus.mutable_max_time_difference ()->set_peer (ToString (peerId));
1321
+ computeNodeStatus.mutable_max_time_difference ()->set_difference_ms (timeDifferenceDuration.MilliSeconds ());
1322
+ computeNodeStatus.set_overall (status);
1323
+
1324
+ if (reportTimeDifference) {
1325
+ TSelfCheckContext tdContext (&context, " NODES_TIME_DIFFERENCE" );
1326
+ FillNodeInfo (peerId, tdContext.Location .mutable_compute ()->mutable_peer ());
1327
+ if (status == Ydb::Monitoring::StatusFlag::GREEN) {
1328
+ tdContext.ReportStatus (status);
1329
+ } else {
1330
+ tdContext.ReportStatus (status, TStringBuilder () << " The nodes have a time difference of " << timeDifferenceDuration.MilliSeconds () << " ms" , ETags::SyncState);
1331
+ }
1332
+ }
1333
+ }
1306
1334
} else {
1307
1335
// context.ReportStatus(Ydb::Monitoring::StatusFlag::RED,
1308
1336
// TStringBuilder() << "Compute node is not available",
@@ -1334,12 +1362,25 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
1334
1362
if (systemStatus != Ydb::Monitoring::StatusFlag::GREEN && systemStatus != Ydb::Monitoring::StatusFlag::GREY) {
1335
1363
context.ReportStatus (systemStatus, " Compute has issues with system tablets" , ETags::ComputeState, {ETags::SystemTabletState});
1336
1364
}
1365
+ long maxClockSkewUs = 0 ;
1366
+ TNodeId maxClockSkewNodeId = 0 ;
1367
+ for (TNodeId nodeId : *computeNodeIds) {
1368
+ auto itNodeSystemState = MergedNodeSystemState.find (nodeId);
1369
+ if (itNodeSystemState != MergedNodeSystemState.end ()) {
1370
+ if (std::count (computeNodeIds->begin (), computeNodeIds->end (), itNodeSystemState->second ->GetMaxClockSkewPeerId ()) > 0
1371
+ && abs (itNodeSystemState->second ->GetMaxClockSkewWithPeerUs ()) > maxClockSkewUs) {
1372
+ maxClockSkewUs = abs (itNodeSystemState->second ->GetMaxClockSkewWithPeerUs ());
1373
+ maxClockSkewNodeId = nodeId;
1374
+ }
1375
+ }
1376
+ }
1337
1377
for (TNodeId nodeId : *computeNodeIds) {
1338
1378
auto & computeNode = *computeStatus.add_nodes ();
1339
- FillComputeNodeStatus (databaseState, nodeId, computeNode, {&context, " COMPUTE_NODE" });
1379
+ FillComputeNodeStatus (databaseState, nodeId, computeNode, {&context, " COMPUTE_NODE" }, maxClockSkewNodeId == nodeId );
1340
1380
}
1341
1381
context.ReportWithMaxChildStatus (" Some nodes are restarting too often" , ETags::ComputeState, {ETags::Uptime});
1342
1382
context.ReportWithMaxChildStatus (" Compute is overloaded" , ETags::ComputeState, {ETags::OverloadState});
1383
+ context.ReportWithMaxChildStatus (" Database has time difference between nodes" , ETags::ComputeState, {ETags::SyncState});
1343
1384
Ydb::Monitoring::StatusFlag::Status tabletsStatus = Ydb::Monitoring::StatusFlag::GREEN;
1344
1385
computeNodeIds->push_back (0 ); // for tablets without node
1345
1386
for (TNodeId nodeId : *computeNodeIds) {
@@ -2086,40 +2127,6 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
2086
2127
const TDuration MAX_CLOCKSKEW_ORANGE_ISSUE_TIME = TDuration::MicroSeconds(25000 );
2087
2128
const TDuration MAX_CLOCKSKEW_YELLOW_ISSUE_TIME = TDuration::MicroSeconds(5000 );
2088
2129
2089
- void FillNodesSyncStatus (TOverallStateContext& context) {
2090
- long maxClockSkewUs = 0 ;
2091
- TNodeId maxClockSkewPeerId = 0 ;
2092
- TNodeId maxClockSkewNodeId = 0 ;
2093
- for (auto & [nodeId, nodeSystemState] : MergedNodeSystemState) {
2094
- if (IsTimeDifferenceCheckNode (nodeId) && IsTimeDifferenceCheckNode (nodeSystemState->GetMaxClockSkewPeerId ())
2095
- && abs (nodeSystemState->GetMaxClockSkewWithPeerUs ()) > maxClockSkewUs) {
2096
- maxClockSkewUs = abs (nodeSystemState->GetMaxClockSkewWithPeerUs ());
2097
- maxClockSkewPeerId = nodeSystemState->GetMaxClockSkewPeerId ();
2098
- maxClockSkewNodeId = nodeId;
2099
- }
2100
- }
2101
- if (!maxClockSkewNodeId) {
2102
- return ;
2103
- }
2104
-
2105
- TSelfCheckResult syncContext;
2106
- syncContext.Type = " NODES_TIME_DIFFERENCE" ;
2107
- FillNodeInfo (maxClockSkewNodeId, syncContext.Location .mutable_node ());
2108
- FillNodeInfo (maxClockSkewPeerId, syncContext.Location .mutable_peer ());
2109
-
2110
- TDuration maxClockSkewTime = TDuration::MicroSeconds (maxClockSkewUs);
2111
- if (maxClockSkewTime > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) {
2112
- syncContext.ReportStatus (Ydb::Monitoring::StatusFlag::ORANGE, TStringBuilder () << " The nodes have a time difference of " << maxClockSkewTime.MilliSeconds () << " ms" , ETags::SyncState);
2113
- } else if (maxClockSkewTime > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) {
2114
- syncContext.ReportStatus (Ydb::Monitoring::StatusFlag::YELLOW, TStringBuilder () << " The nodes have a time difference of " << maxClockSkewTime.MilliSeconds () << " ms" , ETags::SyncState);
2115
- } else {
2116
- syncContext.ReportStatus (Ydb::Monitoring::StatusFlag::GREEN);
2117
- }
2118
-
2119
- context.UpdateMaxStatus (syncContext.GetOverallStatus ());
2120
- context.AddIssues (syncContext.IssueRecords );
2121
- }
2122
-
2123
2130
void FillResult (TOverallStateContext context) {
2124
2131
if (IsSpecificDatabaseFilter ()) {
2125
2132
FillDatabaseResult (context, FilterDatabase, DatabaseState[FilterDatabase]);
@@ -2128,7 +2135,6 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
2128
2135
FillDatabaseResult (context, path, state);
2129
2136
}
2130
2137
}
2131
- FillNodesSyncStatus (context);
2132
2138
if (DatabaseState.empty ()) {
2133
2139
Ydb::Monitoring::DatabaseStatus& databaseStatus (*context.Result ->add_database_status ());
2134
2140
TSelfCheckResult tabletContext;
0 commit comments