ydb-platform · StekPerepolnen · Jan 18, 2024 · Jan 17, 2024 · Jan 17, 2024 · Jan 18, 2024
@@ -141,6 +141,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
         SystemTabletState,
         OverloadState,
         SyncState,
+        Uptime,
     };
 
     struct TTenantInfo {
@@ -230,6 +231,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
         TVector<TString> StoragePoolNames;
         THashMap<std::pair<TTabletId, NNodeWhiteboard::TFollowerId>, const NKikimrHive::TTabletInfo*> MergedTabletState;
         THashMap<TNodeId, TNodeTabletState> MergedNodeTabletState;
+        THashMap<TNodeId, ui32> NodeRestartsPerPeriod;
         ui64 StorageQuota;
         ui64 StorageUsage;
     };
@@ -1056,6 +1058,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
                             TString path(itFilterDomainKey->second);
                             TDatabaseState& state(DatabaseState[path]);
                             state.ComputeNodeIds.emplace_back(hiveStat.GetNodeId());
+                            state.NodeRestartsPerPeriod[hiveStat.GetNodeId()] = hiveStat.GetRestartsPerPeriod();
                         }
                     }
                 }
@@ -1246,9 +1249,18 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
         }
     }
 
-    void FillComputeNodeStatus(TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) {
+    void FillComputeNodeStatus(TDatabaseState& databaseState,TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) {
         FillNodeInfo(nodeId, context.Location.mutable_compute()->mutable_node());
 
+        TSelfCheckContext rrContext(&context, "NODE_UPTIME");
+        if (databaseState.NodeRestartsPerPeriod[nodeId] >= 30) {
+            rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Node is restarting too often", ETags::Uptime);
+        } else if (databaseState.NodeRestartsPerPeriod[nodeId] >= 10) {
+            rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "The number of node restarts has increased", ETags::Uptime);
+        } else {
+            rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
+        }
+
         auto itNodeSystemState = MergedNodeSystemState.find(nodeId);
         if (itNodeSystemState != MergedNodeSystemState.end()) {
             const NKikimrWhiteboard::TSystemStateInfo& nodeSystemState(*itNodeSystemState->second);
@@ -1306,8 +1318,9 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
             }
             for (TNodeId nodeId : *computeNodeIds) {
                 auto& computeNode = *computeStatus.add_nodes();
-                FillComputeNodeStatus(nodeId, computeNode, {&context, "COMPUTE_NODE"});
+                FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"});
             }
+            context.ReportWithMaxChildStatus("Some nodes are restarting too often", ETags::ComputeState, {ETags::Uptime});
             context.ReportWithMaxChildStatus("Compute is overloaded", ETags::ComputeState, {ETags::OverloadState});
             Ydb::Monitoring::StatusFlag::Status tabletsStatus = Ydb::Monitoring::StatusFlag::GREEN;
             computeNodeIds->push_back(0); // for tablets without node