Skip to content

Commit 945f674

Browse files
Merge 891a1de into 851759e
2 parents 851759e + 891a1de commit 945f674

File tree

2 files changed

+12
-36
lines changed

2 files changed

+12
-36
lines changed

ydb/core/health_check/health_check.cpp

Lines changed: 12 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
242242
ui64 StorageQuota = 0;
243243
ui64 StorageUsage = 0;
244244
TMaybeServerlessComputeResourcesMode ServerlessComputeResourcesMode;
245+
TNodeId MaxTimeDifferenceNodeId = 0;
245246
TString Path;
246247
};
247248

@@ -566,20 +567,6 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
566567
return FilterDatabase && FilterDatabase != DomainPath;
567568
}
568569

569-
bool IsTimeDifferenceCheckNode(const TNodeId nodeId) const {
570-
if (!IsSpecificDatabaseFilter()) {
571-
return true;
572-
}
573-
574-
auto it = DatabaseState.find(FilterDatabase);
575-
if (it == DatabaseState.end()) {
576-
return false;
577-
}
578-
auto& computeNodeIds = it->second.ComputeNodeIds;
579-
580-
return std::find(computeNodeIds.begin(), computeNodeIds.end(), nodeId) != computeNodeIds.end();
581-
}
582-
583570
void Bootstrap() {
584571
FilterDatabase = Request->Database;
585572
if (Request->Request.operation_params().has_operation_timeout()) {
@@ -1450,7 +1437,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
14501437
}
14511438
}
14521439

1453-
void FillComputeNodeStatus(TDatabaseState& databaseState, TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context, bool reportTimeDifference) {
1440+
void FillComputeNodeStatus(TDatabaseState& databaseState, TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) {
14541441
FillNodeInfo(nodeId, context.Location.mutable_compute()->mutable_node());
14551442

14561443
TSelfCheckContext rrContext(&context, "NODE_UPTIME");
@@ -1502,17 +1489,15 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
15021489
status = Ydb::Monitoring::StatusFlag::GREEN;
15031490
}
15041491

1505-
computeNodeStatus.mutable_max_time_difference()->set_peer(ToString(peerId));
1506-
computeNodeStatus.mutable_max_time_difference()->set_difference_ms(timeDifferenceDuration.MilliSeconds());
1507-
computeNodeStatus.set_overall(status);
1508-
1509-
if (reportTimeDifference) {
1492+
if (databaseState.MaxTimeDifferenceNodeId == nodeId) {
15101493
TSelfCheckContext tdContext(&context, "NODES_TIME_DIFFERENCE");
1511-
FillNodeInfo(peerId, tdContext.Location.mutable_compute()->mutable_peer());
15121494
if (status == Ydb::Monitoring::StatusFlag::GREEN) {
15131495
tdContext.ReportStatus(status);
15141496
} else {
1515-
tdContext.ReportStatus(status, TStringBuilder() << "The nodes have a time difference of " << timeDifferenceDuration.MilliSeconds() << " ms", ETags::SyncState);
1497+
tdContext.ReportStatus(status, TStringBuilder() << "Node is "
1498+
<< timeDifferenceDuration.MilliSeconds() << " ms "
1499+
<< (timeDifferenceUs > 0 ? "behind " : "ahead of ")
1500+
<< "peer [" << peerId << "]", ETags::SyncState);
15161501
}
15171502
}
15181503
}
@@ -1580,21 +1565,20 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
15801565
if (systemStatus != Ydb::Monitoring::StatusFlag::GREEN && systemStatus != Ydb::Monitoring::StatusFlag::GREY) {
15811566
context.ReportStatus(systemStatus, "Compute has issues with system tablets", ETags::ComputeState, {ETags::SystemTabletState});
15821567
}
1583-
long maxClockSkewUs = 0;
1584-
TNodeId maxClockSkewNodeId = 0;
1568+
long maxTimeDifferenceUs = 0;
15851569
for (TNodeId nodeId : *computeNodeIds) {
15861570
auto itNodeSystemState = MergedNodeSystemState.find(nodeId);
15871571
if (itNodeSystemState != MergedNodeSystemState.end()) {
15881572
if (std::count(computeNodeIds->begin(), computeNodeIds->end(), itNodeSystemState->second->GetMaxClockSkewPeerId()) > 0
1589-
&& abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs()) > maxClockSkewUs) {
1590-
maxClockSkewUs = abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs());
1591-
maxClockSkewNodeId = nodeId;
1573+
&& abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs()) > maxTimeDifferenceUs) {
1574+
maxTimeDifferenceUs = abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs());
1575+
databaseState.MaxTimeDifferenceNodeId = nodeId;
15921576
}
15931577
}
15941578
}
15951579
for (TNodeId nodeId : *computeNodeIds) {
15961580
auto& computeNode = *computeStatus.add_nodes();
1597-
FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"}, maxClockSkewNodeId == nodeId);
1581+
FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"});
15981582
}
15991583
FillComputeDatabaseStatus(databaseState, computeStatus, {&context, "COMPUTE_QUOTA"});
16001584
context.ReportWithMaxChildStatus("Some nodes are restarting too often", ETags::ComputeState, {ETags::Uptime});

ydb/public/api/protos/ydb_monitoring.proto

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -106,19 +106,12 @@ message LoadAverageStatus {
106106
uint32 cores = 3;
107107
}
108108

109-
message TimeDifferenceStatus {
110-
StatusFlag.Status overall = 1;
111-
int64 difference_ms = 2;
112-
string peer = 3;
113-
}
114-
115109
message ComputeNodeStatus {
116110
string id = 1;
117111
StatusFlag.Status overall = 2;
118112
repeated ComputeTabletStatus tablets = 3;
119113
repeated ThreadPoolStatus pools = 4;
120114
LoadAverageStatus load = 5;
121-
TimeDifferenceStatus max_time_difference = 6;
122115
}
123116

124117
message ComputeStatus {
@@ -174,7 +167,6 @@ message LocationCompute {
174167
LocationNode node = 1;
175168
LocationComputePool pool = 2;
176169
LocationComputeTablet tablet = 3;
177-
LocationNode peer = 4;
178170
}
179171

180172
message LocationDatabase {

0 commit comments

Comments
 (0)