@@ -242,6 +242,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
242242 ui64 StorageQuota = 0 ;
243243 ui64 StorageUsage = 0 ;
244244 TMaybeServerlessComputeResourcesMode ServerlessComputeResourcesMode;
245+ TNodeId MaxTimeDifferenceNodeId = 0 ;
245246 TString Path;
246247 };
247248
@@ -566,20 +567,6 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
566567 return FilterDatabase && FilterDatabase != DomainPath;
567568 }
568569
569- bool IsTimeDifferenceCheckNode (const TNodeId nodeId) const {
570- if (!IsSpecificDatabaseFilter ()) {
571- return true ;
572- }
573-
574- auto it = DatabaseState.find (FilterDatabase);
575- if (it == DatabaseState.end ()) {
576- return false ;
577- }
578- auto & computeNodeIds = it->second .ComputeNodeIds ;
579-
580- return std::find (computeNodeIds.begin (), computeNodeIds.end (), nodeId) != computeNodeIds.end ();
581- }
582-
583570 void Bootstrap () {
584571 FilterDatabase = Request->Database ;
585572 if (Request->Request .operation_params ().has_operation_timeout ()) {
@@ -1450,7 +1437,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
14501437 }
14511438 }
14521439
1453- void FillComputeNodeStatus (TDatabaseState& databaseState, TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context, bool reportTimeDifference ) {
1440+ void FillComputeNodeStatus (TDatabaseState& databaseState, TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) {
14541441 FillNodeInfo (nodeId, context.Location .mutable_compute ()->mutable_node ());
14551442
14561443 TSelfCheckContext rrContext (&context, " NODE_UPTIME" );
@@ -1502,17 +1489,15 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
15021489 status = Ydb::Monitoring::StatusFlag::GREEN;
15031490 }
15041491
1505- computeNodeStatus.mutable_max_time_difference ()->set_peer (ToString (peerId));
1506- computeNodeStatus.mutable_max_time_difference ()->set_difference_ms (timeDifferenceDuration.MilliSeconds ());
1507- computeNodeStatus.set_overall (status);
1508-
1509- if (reportTimeDifference) {
1492+ if (databaseState.MaxTimeDifferenceNodeId == nodeId) {
15101493 TSelfCheckContext tdContext (&context, " NODES_TIME_DIFFERENCE" );
1511- FillNodeInfo (peerId, tdContext.Location .mutable_compute ()->mutable_peer ());
15121494 if (status == Ydb::Monitoring::StatusFlag::GREEN) {
15131495 tdContext.ReportStatus (status);
15141496 } else {
1515- tdContext.ReportStatus (status, TStringBuilder () << " The nodes have a time difference of " << timeDifferenceDuration.MilliSeconds () << " ms" , ETags::SyncState);
1497+ tdContext.ReportStatus (status, TStringBuilder () << " Node is "
1498+ << timeDifferenceDuration.MilliSeconds () << " ms "
1499+ << (timeDifferenceUs > 0 ? " behind " : " ahead of " )
1500+ << " peer [" << peerId << " ]" , ETags::SyncState);
15161501 }
15171502 }
15181503 }
@@ -1580,21 +1565,20 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
15801565 if (systemStatus != Ydb::Monitoring::StatusFlag::GREEN && systemStatus != Ydb::Monitoring::StatusFlag::GREY) {
15811566 context.ReportStatus (systemStatus, " Compute has issues with system tablets" , ETags::ComputeState, {ETags::SystemTabletState});
15821567 }
1583- long maxClockSkewUs = 0 ;
1584- TNodeId maxClockSkewNodeId = 0 ;
1568+ long maxTimeDifferenceUs = 0 ;
15851569 for (TNodeId nodeId : *computeNodeIds) {
15861570 auto itNodeSystemState = MergedNodeSystemState.find (nodeId);
15871571 if (itNodeSystemState != MergedNodeSystemState.end ()) {
15881572 if (std::count (computeNodeIds->begin (), computeNodeIds->end (), itNodeSystemState->second ->GetMaxClockSkewPeerId ()) > 0
1589- && abs (itNodeSystemState->second ->GetMaxClockSkewWithPeerUs ()) > maxClockSkewUs ) {
1590- maxClockSkewUs = abs (itNodeSystemState->second ->GetMaxClockSkewWithPeerUs ());
1591- maxClockSkewNodeId = nodeId;
1573+ && abs (itNodeSystemState->second ->GetMaxClockSkewWithPeerUs ()) > maxTimeDifferenceUs ) {
1574+ maxTimeDifferenceUs = abs (itNodeSystemState->second ->GetMaxClockSkewWithPeerUs ());
1575+ databaseState. MaxTimeDifferenceNodeId = nodeId;
15921576 }
15931577 }
15941578 }
15951579 for (TNodeId nodeId : *computeNodeIds) {
15961580 auto & computeNode = *computeStatus.add_nodes ();
1597- FillComputeNodeStatus (databaseState, nodeId, computeNode, {&context, " COMPUTE_NODE" }, maxClockSkewNodeId == nodeId );
1581+ FillComputeNodeStatus (databaseState, nodeId, computeNode, {&context, " COMPUTE_NODE" });
15981582 }
15991583 FillComputeDatabaseStatus (databaseState, computeStatus, {&context, " COMPUTE_QUOTA" });
16001584 context.ReportWithMaxChildStatus (" Some nodes are restarting too often" , ETags::ComputeState, {ETags::Uptime});
0 commit comments