@@ -1450,7 +1450,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
1450
1450
}
1451
1451
}
1452
1452
1453
- void FillComputeNodeStatus (TDatabaseState& databaseState, TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) {
1453
+ void FillComputeNodeStatus (TDatabaseState& databaseState, TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context, bool reportTimeDifference ) {
1454
1454
FillNodeInfo (nodeId, context.Location .mutable_compute ()->mutable_node ());
1455
1455
1456
1456
TSelfCheckContext rrContext (&context, " NODE_UPTIME" );
@@ -1488,6 +1488,34 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
1488
1488
}
1489
1489
loadAverageStatus.set_overall (laContext.GetOverallStatus ());
1490
1490
}
1491
+
1492
// Per-node clock-skew section (added by this change).
// Reads from nodeSystemState the peer with the largest clock skew and the skew value,
// classifies it against the MAX_CLOCKSKEW_* thresholds, writes the result into
// computeNodeStatus, and, only when reportTimeDifference is true, emits a
// NODES_TIME_DIFFERENCE entry through a child self-check context.
+ {
1493
+ TNodeId peerId = nodeSystemState.GetMaxClockSkewPeerId ();
1494
// NOTE(review): the getter's return type is not visible here. If it is a 64-bit
// integer, `long` and the unqualified abs() below may narrow it on platforms where
// long is 32 bits. Confirm, or switch to a fixed-width type / std::abs.
+ long timeDifferenceUs = nodeSystemState.GetMaxClockSkewWithPeerUs ();
1495
// abs() is applied first, so only the magnitude of the skew is compared below.
+ TDuration timeDifferenceDuration = TDuration::MicroSeconds (abs (timeDifferenceUs));
1496
+ Ydb::Monitoring::StatusFlag::Status status;
1497
// Strictly above the ORANGE threshold -> ORANGE; otherwise strictly above the
// YELLOW threshold -> YELLOW; anything else -> GREEN.
+ if (timeDifferenceDuration > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) {
1498
+ status = Ydb::Monitoring::StatusFlag::ORANGE;
1499
+ } else if (timeDifferenceDuration > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) {
1500
+ status = Ydb::Monitoring::StatusFlag::YELLOW;
1501
+ } else {
1502
+ status = Ydb::Monitoring::StatusFlag::GREEN;
1503
+ }
1504
+
1505
// These fields are filled for every node, independent of reportTimeDifference.
+ computeNodeStatus.mutable_max_time_difference ()->set_peer (ToString (peerId));
1506
+ computeNodeStatus.mutable_max_time_difference ()->set_difference_ms (timeDifferenceDuration.MilliSeconds ());
1507
// NOTE(review): overall is assigned from the clock-skew status alone. The earlier
// part of this function is not visible in this diff; confirm this does not
// overwrite an overall status computed before this point.
+ computeNodeStatus.set_overall (status);
1508
+
1509
// The caller passes true for a single node (the one it selected as having the
// largest skew), so the issue is reported once rather than once per node.
+ if (reportTimeDifference) {
1510
+ TSelfCheckContext tdContext (&context, " NODES_TIME_DIFFERENCE" );
1511
+ FillNodeInfo (peerId, tdContext.Location .mutable_compute ()->mutable_peer ());
1512
// GREEN is reported without a message; any other status carries the measured
// difference in milliseconds and the SyncState tag.
+ if (status == Ydb::Monitoring::StatusFlag::GREEN) {
1513
+ tdContext.ReportStatus (status);
1514
+ } else {
1515
+ tdContext.ReportStatus (status, TStringBuilder () << " The nodes have a time difference of " << timeDifferenceDuration.MilliSeconds () << " ms" , ETags::SyncState);
1516
+ }
1517
+ }
1518
+ }
1491
1519
} else {
1492
1520
// context.ReportStatus(Ydb::Monitoring::StatusFlag::RED,
1493
1521
// TStringBuilder() << "Compute node is not available",
@@ -1552,14 +1580,27 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
1552
1580
if (systemStatus != Ydb::Monitoring::StatusFlag::GREEN && systemStatus != Ydb::Monitoring::StatusFlag::GREY) {
1553
1581
context.ReportStatus (systemStatus, " Compute has issues with system tablets" , ETags::ComputeState, {ETags::SystemTabletState});
1554
1582
}
1583
// Select the compute node with the largest absolute clock skew (added by this change).
// A node is a candidate only if it has an entry in MergedNodeSystemState and its
// max-skew peer is itself one of the ids in computeNodeIds.
// The comparison is strict and starts from 0, so a node whose skew is exactly 0 is
// never selected; if no node qualifies, maxClockSkewNodeId stays 0.
// The selected id is compared with each node id in the FillComputeNodeStatus call
// that follows, which decides which single node reports the time difference.
+ long maxClockSkewUs = 0 ;
1584
+ TNodeId maxClockSkewNodeId = 0 ;
1585
+ for (TNodeId nodeId : *computeNodeIds) {
1586
+ auto itNodeSystemState = MergedNodeSystemState.find (nodeId);
1587
+ if (itNodeSystemState != MergedNodeSystemState.end ()) {
1588
// NOTE(review): std::count scans the whole computeNodeIds list for every node, so
// this loop is quadratic in the number of compute nodes; a membership test that
// stops at the first match (or a set) would do the same job.
// NOTE(review): the skew getter is called twice and passed to unqualified abs();
// its return type is not visible here. Confirm `long` is wide enough on all targets.
+ if (std::count (computeNodeIds->begin (), computeNodeIds->end (), itNodeSystemState->second ->GetMaxClockSkewPeerId ()) > 0
1589
+ && abs (itNodeSystemState->second ->GetMaxClockSkewWithPeerUs ()) > maxClockSkewUs) {
1590
+ maxClockSkewUs = abs (itNodeSystemState->second ->GetMaxClockSkewWithPeerUs ());
1591
+ maxClockSkewNodeId = nodeId;
1592
+ }
1593
+ }
1594
+ }
1555
1595
for (TNodeId nodeId : *computeNodeIds) {
1556
1596
auto & computeNode = *computeStatus.add_nodes ();
1557
- FillComputeNodeStatus (databaseState, nodeId, computeNode, {&context, " COMPUTE_NODE" });
1597
+ FillComputeNodeStatus (databaseState, nodeId, computeNode, {&context, " COMPUTE_NODE" }, maxClockSkewNodeId == nodeId );
1558
1598
}
1559
1599
FillComputeDatabaseStatus (databaseState, computeStatus, {&context, " COMPUTE_QUOTA" });
1560
1600
context.ReportWithMaxChildStatus (" Some nodes are restarting too often" , ETags::ComputeState, {ETags::Uptime});
1561
1601
context.ReportWithMaxChildStatus (" Compute is overloaded" , ETags::ComputeState, {ETags::OverloadState});
1562
1602
context.ReportWithMaxChildStatus (" Compute quota usage" , ETags::ComputeState, {ETags::QuotaUsage});
1603
+ context.ReportWithMaxChildStatus (" Database has time difference between nodes" , ETags::ComputeState, {ETags::SyncState});
1563
1604
Ydb::Monitoring::StatusFlag::Status tabletsStatus = Ydb::Monitoring::StatusFlag::GREEN;
1564
1605
computeNodeIds->push_back (0 ); // for tablets without node
1565
1606
for (TNodeId nodeId : *computeNodeIds) {
@@ -2579,17 +2620,14 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
2579
2620
databaseStatus.set_name (path);
2580
2621
FillCompute (state, *databaseStatus.mutable_compute (), {&dbContext, " COMPUTE" });
2581
2622
FillStorage (state, *databaseStatus.mutable_storage (), {&dbContext, " STORAGE" });
2582
- FillTimeDifference (state, *databaseStatus.mutable_time_difference (), {&dbContext, " NODES_TIME_DIFFERENCE" });
2583
2623
if (databaseStatus.compute ().overall () != Ydb::Monitoring::StatusFlag::GREEN
2584
2624
&& databaseStatus.storage ().overall () != Ydb::Monitoring::StatusFlag::GREEN) {
2585
2625
dbContext.ReportStatus (MaxStatus (databaseStatus.compute ().overall (), databaseStatus.storage ().overall ()),
2586
- " Database has multiple issues" , ETags::DBState, { ETags::ComputeState, ETags::StorageState, ETags::SyncState });
2626
+ " Database has multiple issues" , ETags::DBState, { ETags::ComputeState, ETags::StorageState });
2587
2627
} else if (databaseStatus.compute ().overall () != Ydb::Monitoring::StatusFlag::GREEN) {
2588
- dbContext.ReportStatus (databaseStatus.compute ().overall (), " Database has compute issues" , ETags::DBState, {ETags::ComputeState, ETags::SyncState });
2628
+ dbContext.ReportStatus (databaseStatus.compute ().overall (), " Database has compute issues" , ETags::DBState, {ETags::ComputeState});
2589
2629
} else if (databaseStatus.storage ().overall () != Ydb::Monitoring::StatusFlag::GREEN) {
2590
- dbContext.ReportStatus (databaseStatus.storage ().overall (), " Database has storage issues" , ETags::DBState, {ETags::StorageState, ETags::SyncState});
2591
- } else if (databaseStatus.time_difference ().overall () != Ydb::Monitoring::StatusFlag::GREEN) {
2592
- dbContext.ReportStatus (databaseStatus.time_difference ().overall (), " Database has time difference issues" , ETags::DBState, {ETags::SyncState});
2630
+ dbContext.ReportStatus (databaseStatus.storage ().overall (), " Database has storage issues" , ETags::DBState, {ETags::StorageState});
2593
2631
}
2594
2632
databaseStatus.set_overall (dbContext.GetOverallStatus ());
2595
2633
context.UpdateMaxStatus (dbContext.GetOverallStatus ());
@@ -2602,58 +2640,6 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
2602
2640
// Clock-skew thresholds used by the time-difference checks.
// A skew strictly greater than 25000 us (25 ms) is reported as ORANGE.
const TDuration MAX_CLOCKSKEW_ORANGE_ISSUE_TIME = TDuration::MicroSeconds(25000 );
2603
2641
// A skew strictly greater than 5000 us (5 ms), but not above the ORANGE threshold, is reported as YELLOW.
const TDuration MAX_CLOCKSKEW_YELLOW_ISSUE_TIME = TDuration::MicroSeconds(5000 );
2604
2642
2605
// Database-level clock-skew check (this whole function is deleted by the diff; its
// call site in the database result code is deleted as well).
// Finds the compute node with the largest absolute clock skew to a peer, reports the
// resulting status through `context`, and fills `timeDifferenceStatus` with the node,
// the peer, the difference in milliseconds and the overall status.
//   databaseState        - database whose compute nodes are inspected
//   timeDifferenceStatus - output message filled by this function
//   context              - self-check context used for location and status reporting
- void FillTimeDifference (TDatabaseState& databaseState, Ydb::Monitoring::TimeDifferenceStatus& timeDifferenceStatus, TSelfCheckContext context) {
2606
- long maxClockSkewUs = 0 ;
2607
- TNodeId maxClockSkewPeerId = 0 ;
2608
- TNodeId maxClockSkewNodeId = 0 ;
2609
-
2610
// By default the database's own compute nodes are inspected. If ResourcePathId is
// set and the serverless compute mode is not Exclusive, the node list of the
// database found through FilterDomainKey is used instead (when such an entry exists).
- TVector<TNodeId>* computeNodeIds = &databaseState.ComputeNodeIds ;
2611
- if (databaseState.ResourcePathId
2612
- && databaseState.ServerlessComputeResourcesMode != NKikimrSubDomains::EServerlessComputeResourcesModeExclusive)
2613
- {
2614
- auto itDatabase = FilterDomainKey.find (TSubDomainKey (databaseState.ResourcePathId .OwnerId , databaseState.ResourcePathId .LocalPathId ));
2615
- if (itDatabase != FilterDomainKey.end ()) {
2616
- const TString& sharedDatabaseName = itDatabase->second ;
2617
- TDatabaseState& sharedDatabase = DatabaseState[sharedDatabaseName];
2618
- computeNodeIds = &sharedDatabase.ComputeNodeIds ;
2619
- }
2620
- }
2621
-
2622
// Track the node/peer pair with the largest absolute skew. Both the node and its
// peer must pass IsTimeDifferenceCheckNode, and the comparison is strict, so a
// skew of exactly 0 never selects a node.
- for (TNodeId nodeId : *computeNodeIds) {
2623
- auto itNodeSystemState = MergedNodeSystemState.find (nodeId);
2624
- if (itNodeSystemState != MergedNodeSystemState.end ()) {
2625
- if (IsTimeDifferenceCheckNode (nodeId) && IsTimeDifferenceCheckNode (itNodeSystemState->second ->GetMaxClockSkewPeerId ())
2626
- && abs (itNodeSystemState->second ->GetMaxClockSkewWithPeerUs ()) > maxClockSkewUs) {
2627
- maxClockSkewUs = abs (itNodeSystemState->second ->GetMaxClockSkewWithPeerUs ());
2628
- maxClockSkewPeerId = itNodeSystemState->second ->GetMaxClockSkewPeerId ();
2629
- maxClockSkewNodeId = nodeId;
2630
- }
2631
- }
2632
- }
2633
-
2634
// No node was selected: mark the section GREEN and return without reporting
// anything through `context`.
- if (!maxClockSkewNodeId) {
2635
- timeDifferenceStatus.set_overall (Ydb::Monitoring::StatusFlag::GREEN);
2636
- return ;
2637
- }
2638
-
2639
- FillNodeInfo (maxClockSkewNodeId, context.Location .mutable_node ());
2640
- FillNodeInfo (maxClockSkewPeerId, context.Location .mutable_peer ());
2641
-
2642
// Classify the skew: strictly above the ORANGE threshold -> ORANGE, strictly above
// the YELLOW threshold -> YELLOW, otherwise GREEN (reported without a message).
- TDuration maxClockSkewTime = TDuration::MicroSeconds (maxClockSkewUs);
2643
- if (maxClockSkewTime > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) {
2644
- context.ReportStatus (Ydb::Monitoring::StatusFlag::ORANGE, TStringBuilder () << " The nodes have a time difference of " << maxClockSkewTime.MilliSeconds () << " ms" , ETags::SyncState);
2645
- } else if (maxClockSkewTime > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) {
2646
- context.ReportStatus (Ydb::Monitoring::StatusFlag::YELLOW, TStringBuilder () << " The nodes have a time difference of " << maxClockSkewTime.MilliSeconds () << " ms" , ETags::SyncState);
2647
- } else {
2648
- context.ReportStatus (Ydb::Monitoring::StatusFlag::GREEN);
2649
- }
2650
-
2651
// The overall status is taken from the context after the report above.
- timeDifferenceStatus.set_node (ToString (maxClockSkewNodeId));
2652
- timeDifferenceStatus.set_peer (ToString (maxClockSkewPeerId));
2653
- timeDifferenceStatus.set_max_difference_ms (maxClockSkewTime.MilliSeconds ());
2654
- timeDifferenceStatus.set_overall (context.GetOverallStatus ());
2655
- }
2656
-
2657
2643
void FillResult (TOverallStateContext context) {
2658
2644
if (IsSpecificDatabaseFilter ()) {
2659
2645
FillDatabaseResult (context, FilterDatabase, DatabaseState[FilterDatabase]);
0 commit comments