@@ -235,6 +235,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
235
235
ui64 StorageQuota;
236
236
ui64 StorageUsage;
237
237
TMaybeServerlessComputeResourcesMode ServerlessComputeResourcesMode;
238
+ TNodeId MaxTimeDifferenceNodeId = 0 ;
238
239
};
239
240
240
241
struct TSelfCheckResult {
@@ -1265,7 +1266,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
1265
1266
}
1266
1267
}
1267
1268
1268
- void FillComputeNodeStatus (TDatabaseState& databaseState,TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) {
1269
+ void FillComputeNodeStatus (TDatabaseState& databaseState, TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) {
1269
1270
FillNodeInfo (nodeId, context.Location .mutable_compute ()->mutable_node ());
1270
1271
1271
1272
TSelfCheckContext rrContext (&context, " NODE_UPTIME" );
@@ -1303,6 +1304,34 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
1303
1304
}
1304
1305
loadAverageStatus.set_overall (laContext.GetOverallStatus ());
1305
1306
}
1307
+
1308
+ if (nodeSystemState.HasMaxClockSkewPeerId ()) {
1309
+ TNodeId peerId = nodeSystemState.GetMaxClockSkewPeerId ();
1310
+ long timeDifferenceUs = nodeSystemState.GetMaxClockSkewWithPeerUs ();
1311
+ TDuration timeDifferenceDuration = TDuration::MicroSeconds (abs (timeDifferenceUs));
1312
+ Ydb::Monitoring::StatusFlag::Status status;
1313
+ if (timeDifferenceDuration > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) {
1314
+ status = Ydb::Monitoring::StatusFlag::ORANGE;
1315
+ } else if (timeDifferenceDuration > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) {
1316
+ status = Ydb::Monitoring::StatusFlag::YELLOW;
1317
+ } else {
1318
+ status = Ydb::Monitoring::StatusFlag::GREEN;
1319
+ }
1320
+
1321
+ computeNodeStatus.mutable_max_time_difference ()->set_peer (ToString (peerId));
1322
+ computeNodeStatus.mutable_max_time_difference ()->set_difference_ms (timeDifferenceDuration.MilliSeconds ());
1323
+ computeNodeStatus.set_overall (status);
1324
+
1325
+ if (databaseState.MaxTimeDifferenceNodeId == nodeId) {
1326
+ TSelfCheckContext tdContext (&context, " NODES_TIME_DIFFERENCE" );
1327
+ FillNodeInfo (peerId, tdContext.Location .mutable_compute ()->mutable_peer ());
1328
+ if (status == Ydb::Monitoring::StatusFlag::GREEN) {
1329
+ tdContext.ReportStatus (status);
1330
+ } else {
1331
+ tdContext.ReportStatus (status, TStringBuilder () << " The nodes have a time difference of " << timeDifferenceDuration.MilliSeconds () << " ms" , ETags::SyncState);
1332
+ }
1333
+ }
1334
+ }
1306
1335
} else {
1307
1336
// context.ReportStatus(Ydb::Monitoring::StatusFlag::RED,
1308
1337
// TStringBuilder() << "Compute node is not available",
@@ -1334,12 +1363,24 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
1334
1363
if (systemStatus != Ydb::Monitoring::StatusFlag::GREEN && systemStatus != Ydb::Monitoring::StatusFlag::GREY) {
1335
1364
context.ReportStatus (systemStatus, " Compute has issues with system tablets" , ETags::ComputeState, {ETags::SystemTabletState});
1336
1365
}
1366
+ long maxTimeDifferenceUs = 0 ;
1367
+ for (TNodeId nodeId : *computeNodeIds) {
1368
+ auto itNodeSystemState = MergedNodeSystemState.find (nodeId);
1369
+ if (itNodeSystemState != MergedNodeSystemState.end ()) {
1370
+ if (std::count (computeNodeIds->begin (), computeNodeIds->end (), itNodeSystemState->second ->GetMaxClockSkewPeerId ()) > 0
1371
+ && abs (itNodeSystemState->second ->GetMaxClockSkewWithPeerUs ()) > maxTimeDifferenceUs) {
1372
+ maxTimeDifferenceUs = abs (itNodeSystemState->second ->GetMaxClockSkewWithPeerUs ());
1373
+ databaseState.MaxTimeDifferenceNodeId = nodeId;
1374
+ }
1375
+ }
1376
+ }
1337
1377
for (TNodeId nodeId : *computeNodeIds) {
1338
1378
auto & computeNode = *computeStatus.add_nodes ();
1339
1379
FillComputeNodeStatus (databaseState, nodeId, computeNode, {&context, " COMPUTE_NODE" });
1340
1380
}
1341
1381
context.ReportWithMaxChildStatus (" Some nodes are restarting too often" , ETags::ComputeState, {ETags::Uptime});
1342
1382
context.ReportWithMaxChildStatus (" Compute is overloaded" , ETags::ComputeState, {ETags::OverloadState});
1383
+ context.ReportWithMaxChildStatus (" Database has time difference between nodes" , ETags::ComputeState, {ETags::SyncState});
1343
1384
Ydb::Monitoring::StatusFlag::Status tabletsStatus = Ydb::Monitoring::StatusFlag::GREEN;
1344
1385
computeNodeIds->push_back (0 ); // for tablets without node
1345
1386
for (TNodeId nodeId : *computeNodeIds) {
@@ -2086,40 +2127,6 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
2086
2127
const TDuration MAX_CLOCKSKEW_ORANGE_ISSUE_TIME = TDuration::MicroSeconds(25000 );
2087
2128
const TDuration MAX_CLOCKSKEW_YELLOW_ISSUE_TIME = TDuration::MicroSeconds(5000 );
2088
2129
2089
// NOTE: every code line of this function carries the diff's '-' marker, i.e. this
// revision removes the function; the bare numbers are line numbers from the diff view.
//
// FillNodesSyncStatus: finds the node/peer pair with the largest absolute clock skew,
// reports it as a " NODES_TIME_DIFFERENCE" check, and merges that check's status and
// issue records into the overall state `context`.
- void FillNodesSyncStatus (TOverallStateContext& context) {
2090
- long maxClockSkewUs = 0 ;
2091
- TNodeId maxClockSkewPeerId = 0 ;
2092
- TNodeId maxClockSkewNodeId = 0 ;
2093
// Scan all merged node system states; a pair is considered only when both the node
// and its reported peer pass IsTimeDifferenceCheckNode. The comparison is strict, so
// a skew of exactly 0 never selects a node.
- for (auto & [nodeId, nodeSystemState] : MergedNodeSystemState) {
2094
- if (IsTimeDifferenceCheckNode (nodeId) && IsTimeDifferenceCheckNode (nodeSystemState->GetMaxClockSkewPeerId ())
2095
- && abs (nodeSystemState->GetMaxClockSkewWithPeerUs ()) > maxClockSkewUs) {
2096
- maxClockSkewUs = abs (nodeSystemState->GetMaxClockSkewWithPeerUs ());
2097
- maxClockSkewPeerId = nodeSystemState->GetMaxClockSkewPeerId ();
2098
- maxClockSkewNodeId = nodeId;
2099
- }
2100
- }
2101
// maxClockSkewNodeId is still 0 when no pair qualified: nothing is reported at all.
- if (!maxClockSkewNodeId) {
2102
- return ;
2103
- }
2104
-
2105
- TSelfCheckResult syncContext;
2106
- syncContext.Type = " NODES_TIME_DIFFERENCE" ;
2107
- FillNodeInfo (maxClockSkewNodeId, syncContext.Location .mutable_node ());
2108
- FillNodeInfo (maxClockSkewPeerId, syncContext.Location .mutable_peer ());
2109
-
2110
// Strict threshold comparison: above the ORANGE limit -> ORANGE, above the YELLOW
// limit -> YELLOW, otherwise GREEN is reported without a message or tag.
- TDuration maxClockSkewTime = TDuration::MicroSeconds (maxClockSkewUs);
2111
- if (maxClockSkewTime > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) {
2112
- syncContext.ReportStatus (Ydb::Monitoring::StatusFlag::ORANGE, TStringBuilder () << " The nodes have a time difference of " << maxClockSkewTime.MilliSeconds () << " ms" , ETags::SyncState);
2113
- } else if (maxClockSkewTime > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) {
2114
- syncContext.ReportStatus (Ydb::Monitoring::StatusFlag::YELLOW, TStringBuilder () << " The nodes have a time difference of " << maxClockSkewTime.MilliSeconds () << " ms" , ETags::SyncState);
2115
- } else {
2116
- syncContext.ReportStatus (Ydb::Monitoring::StatusFlag::GREEN);
2117
- }
2118
-
2119
// Fold this check's overall status and its issue records into the caller's context.
- context.UpdateMaxStatus (syncContext.GetOverallStatus ());
2120
- context.AddIssues (syncContext.IssueRecords );
2121
- }
2122
-
2123
2130
void FillResult (TOverallStateContext context) {
2124
2131
if (IsSpecificDatabaseFilter ()) {
2125
2132
FillDatabaseResult (context, FilterDatabase, DatabaseState[FilterDatabase]);
@@ -2128,7 +2135,6 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
2128
2135
FillDatabaseResult (context, path, state);
2129
2136
}
2130
2137
}
2131
- FillNodesSyncStatus (context);
2132
2138
if (DatabaseState.empty ()) {
2133
2139
Ydb::Monitoring::DatabaseStatus& databaseStatus (*context.Result ->add_database_status ());
2134
2140
TSelfCheckResult tabletContext;
0 commit comments