@@ -235,6 +235,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
235
235
ui64 StorageQuota;
236
236
ui64 StorageUsage;
237
237
TMaybeServerlessComputeResourcesMode ServerlessComputeResourcesMode;
238
+ TNodeId MaxTimeDifferenceNodeId = 0 ;
238
239
};
239
240
240
241
struct TSelfCheckResult {
@@ -813,10 +814,10 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
813
814
ReplyAndPassAway ();
814
815
}
815
816
816
- bool IsStaticNode (const TEvInterconnect::TNodeInfo& nodeInfo ) const {
817
+ bool IsStaticNode (const TNodeId nodeId ) const {
817
818
TAppData* appData = AppData ();
818
819
if (appData->DynamicNameserviceConfig ) {
819
- return nodeInfo. NodeId <= AppData ()->DynamicNameserviceConfig ->MaxStaticNodeId ;
820
+ return nodeId <= AppData ()->DynamicNameserviceConfig ->MaxStaticNodeId ;
820
821
} else {
821
822
return true ;
822
823
}
@@ -827,7 +828,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
827
828
NodesInfo = ev->Release ();
828
829
for (const auto & ni : NodesInfo->Nodes ) {
829
830
MergedNodeInfo[ni.NodeId ] = &ni;
830
- if (IsStaticNode (ni) && needComputeFromStaticNodes) {
831
+ if (IsStaticNode (ni. NodeId ) && needComputeFromStaticNodes) {
831
832
DatabaseState[DomainPath].ComputeNodeIds .push_back (ni.NodeId );
832
833
RequestComputeNode (ni.NodeId );
833
834
}
@@ -1251,7 +1252,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
1251
1252
}
1252
1253
}
1253
1254
1254
- void FillComputeNodeStatus (TDatabaseState& databaseState,TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) {
1255
+ void FillComputeNodeStatus (TDatabaseState& databaseState, TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) {
1255
1256
FillNodeInfo (nodeId, context.Location .mutable_compute ()->mutable_node ());
1256
1257
1257
1258
TSelfCheckContext rrContext (&context, " NODE_UPTIME" );
@@ -1289,6 +1290,32 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
1289
1290
}
1290
1291
loadAverageStatus.set_overall (laContext.GetOverallStatus ());
1291
1292
}
1293
+
1294
+ if (nodeSystemState.HasMaxClockSkewPeerId ()) {
1295
+ TNodeId peerId = nodeSystemState.GetMaxClockSkewPeerId ();
1296
+ long timeDifferenceUs = nodeSystemState.GetMaxClockSkewWithPeerUs ();
1297
+ TDuration timeDifferenceDuration = TDuration::MicroSeconds (abs (timeDifferenceUs));
1298
+ Ydb::Monitoring::StatusFlag::Status status;
1299
+ if (timeDifferenceDuration > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) {
1300
+ status = Ydb::Monitoring::StatusFlag::ORANGE;
1301
+ } else if (timeDifferenceDuration > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) {
1302
+ status = Ydb::Monitoring::StatusFlag::YELLOW;
1303
+ } else {
1304
+ status = Ydb::Monitoring::StatusFlag::GREEN;
1305
+ }
1306
+
1307
+ if (databaseState.MaxTimeDifferenceNodeId == nodeId) {
1308
+ TSelfCheckContext tdContext (&context, " NODES_TIME_DIFFERENCE" );
1309
+ if (status == Ydb::Monitoring::StatusFlag::GREEN) {
1310
+ tdContext.ReportStatus (status);
1311
+ } else {
1312
+ tdContext.ReportStatus (status, TStringBuilder () << " Node is "
1313
+ << timeDifferenceDuration.MilliSeconds () << " ms "
1314
+ << (timeDifferenceUs > 0 ? " behind " : " ahead of " )
1315
+ << " peer [" << peerId << " ]" , ETags::SyncState);
1316
+ }
1317
+ }
1318
+ }
1292
1319
} else {
1293
1320
// context.ReportStatus(Ydb::Monitoring::StatusFlag::RED,
1294
1321
// TStringBuilder() << "Compute node is not available",
@@ -1320,12 +1347,24 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
1320
1347
if (systemStatus != Ydb::Monitoring::StatusFlag::GREEN && systemStatus != Ydb::Monitoring::StatusFlag::GREY) {
1321
1348
context.ReportStatus (systemStatus, " Compute has issues with system tablets" , ETags::ComputeState, {ETags::SystemTabletState});
1322
1349
}
1350
+ long maxTimeDifferenceUs = 0 ;
1351
+ for (TNodeId nodeId : *computeNodeIds) {
1352
+ auto itNodeSystemState = MergedNodeSystemState.find (nodeId);
1353
+ if (itNodeSystemState != MergedNodeSystemState.end ()) {
1354
+ if (std::count (computeNodeIds->begin (), computeNodeIds->end (), itNodeSystemState->second ->GetMaxClockSkewPeerId ()) > 0
1355
+ && abs (itNodeSystemState->second ->GetMaxClockSkewWithPeerUs ()) > maxTimeDifferenceUs) {
1356
+ maxTimeDifferenceUs = abs (itNodeSystemState->second ->GetMaxClockSkewWithPeerUs ());
1357
+ databaseState.MaxTimeDifferenceNodeId = nodeId;
1358
+ }
1359
+ }
1360
+ }
1323
1361
for (TNodeId nodeId : *computeNodeIds) {
1324
1362
auto & computeNode = *computeStatus.add_nodes ();
1325
1363
FillComputeNodeStatus (databaseState, nodeId, computeNode, {&context, " COMPUTE_NODE" });
1326
1364
}
1327
1365
context.ReportWithMaxChildStatus (" Some nodes are restarting too often" , ETags::ComputeState, {ETags::Uptime});
1328
1366
context.ReportWithMaxChildStatus (" Compute is overloaded" , ETags::ComputeState, {ETags::OverloadState});
1367
+ context.ReportWithMaxChildStatus (" Database has time difference between nodes" , ETags::ComputeState, {ETags::SyncState});
1329
1368
Ydb::Monitoring::StatusFlag::Status tabletsStatus = Ydb::Monitoring::StatusFlag::GREEN;
1330
1369
computeNodeIds->push_back (0 ); // for tablets without node
1331
1370
for (TNodeId nodeId : *computeNodeIds) {
@@ -2072,39 +2111,6 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
2072
2111
const TDuration MAX_CLOCKSKEW_ORANGE_ISSUE_TIME = TDuration::MicroSeconds(25000 );
2073
2112
const TDuration MAX_CLOCKSKEW_YELLOW_ISSUE_TIME = TDuration::MicroSeconds(5000 );
2074
2113
2075
- void FillNodesSyncStatus (TOverallStateContext& context) {
2076
- long maxClockSkewUs = 0 ;
2077
- TNodeId maxClockSkewPeerId = 0 ;
2078
- TNodeId maxClockSkewNodeId = 0 ;
2079
- for (auto & [nodeId, nodeSystemState] : MergedNodeSystemState) {
2080
- if (abs (nodeSystemState->GetMaxClockSkewWithPeerUs ()) > maxClockSkewUs) {
2081
- maxClockSkewUs = abs (nodeSystemState->GetMaxClockSkewWithPeerUs ());
2082
- maxClockSkewPeerId = nodeSystemState->GetMaxClockSkewPeerId ();
2083
- maxClockSkewNodeId = nodeId;
2084
- }
2085
- }
2086
- if (!maxClockSkewNodeId) {
2087
- return ;
2088
- }
2089
-
2090
- TSelfCheckResult syncContext;
2091
- syncContext.Type = " NODES_TIME_DIFFERENCE" ;
2092
- FillNodeInfo (maxClockSkewNodeId, syncContext.Location .mutable_node ());
2093
- FillNodeInfo (maxClockSkewPeerId, syncContext.Location .mutable_peer ());
2094
-
2095
- TDuration maxClockSkewTime = TDuration::MicroSeconds (maxClockSkewUs);
2096
- if (maxClockSkewTime > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) {
2097
- syncContext.ReportStatus (Ydb::Monitoring::StatusFlag::ORANGE, TStringBuilder () << " The nodes have a time difference of " << maxClockSkewTime.MilliSeconds () << " ms" , ETags::SyncState);
2098
- } else if (maxClockSkewTime > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) {
2099
- syncContext.ReportStatus (Ydb::Monitoring::StatusFlag::YELLOW, TStringBuilder () << " The nodes have a time difference of " << maxClockSkewTime.MilliSeconds () << " ms" , ETags::SyncState);
2100
- } else {
2101
- syncContext.ReportStatus (Ydb::Monitoring::StatusFlag::GREEN);
2102
- }
2103
-
2104
- context.UpdateMaxStatus (syncContext.GetOverallStatus ());
2105
- context.AddIssues (syncContext.IssueRecords );
2106
- }
2107
-
2108
2114
void FillResult (TOverallStateContext context) {
2109
2115
if (IsSpecificDatabaseFilter ()) {
2110
2116
FillDatabaseResult (context, FilterDatabase, DatabaseState[FilterDatabase]);
@@ -2113,7 +2119,6 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
2113
2119
FillDatabaseResult (context, path, state);
2114
2120
}
2115
2121
}
2116
- FillNodesSyncStatus (context);
2117
2122
if (DatabaseState.empty ()) {
2118
2123
Ydb::Monitoring::DatabaseStatus& databaseStatus (*context.Result ->add_database_status ());
2119
2124
TSelfCheckResult tabletContext;
0 commit comments