Skip to content

Commit 27e55a7

Browse files
move time_difference to compute_node
1 parent 0dc0c27 commit 27e55a7

File tree

2 files changed: 54 additions and 68 deletions.

ydb/core/health_check/health_check.cpp

Lines changed: 46 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1450,7 +1450,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
14501450
}
14511451
}
14521452

1453-
void FillComputeNodeStatus(TDatabaseState& databaseState, TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) {
1453+
void FillComputeNodeStatus(TDatabaseState& databaseState, TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context, bool reportTimeDifference) {
14541454
FillNodeInfo(nodeId, context.Location.mutable_compute()->mutable_node());
14551455

14561456
TSelfCheckContext rrContext(&context, "NODE_UPTIME");
@@ -1488,6 +1488,34 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
14881488
}
14891489
loadAverageStatus.set_overall(laContext.GetOverallStatus());
14901490
}
1491+
1492+
{
1493+
TNodeId peerId = nodeSystemState.GetMaxClockSkewPeerId();
1494+
long timeDifferenceUs = nodeSystemState.GetMaxClockSkewWithPeerUs();
1495+
TDuration timeDifferenceDuration = TDuration::MicroSeconds(abs(timeDifferenceUs));
1496+
Ydb::Monitoring::StatusFlag::Status status;
1497+
if (timeDifferenceDuration > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) {
1498+
status = Ydb::Monitoring::StatusFlag::ORANGE;
1499+
} else if (timeDifferenceDuration > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) {
1500+
status = Ydb::Monitoring::StatusFlag::YELLOW;
1501+
} else {
1502+
status = Ydb::Monitoring::StatusFlag::GREEN;
1503+
}
1504+
1505+
computeNodeStatus.mutable_max_time_difference()->set_peer(ToString(peerId));
1506+
computeNodeStatus.mutable_max_time_difference()->set_difference_ms(timeDifferenceDuration.MilliSeconds());
1507+
computeNodeStatus.set_overall(status);
1508+
1509+
if (reportTimeDifference) {
1510+
TSelfCheckContext tdContext(&context, "NODES_TIME_DIFFERENCE");
1511+
FillNodeInfo(peerId, tdContext.Location.mutable_compute()->mutable_peer());
1512+
if (status == Ydb::Monitoring::StatusFlag::GREEN) {
1513+
tdContext.ReportStatus(status);
1514+
} else {
1515+
tdContext.ReportStatus(status, TStringBuilder() << "The nodes have a time difference of " << timeDifferenceDuration.MilliSeconds() << " ms", ETags::SyncState);
1516+
}
1517+
}
1518+
}
14911519
} else {
14921520
// context.ReportStatus(Ydb::Monitoring::StatusFlag::RED,
14931521
// TStringBuilder() << "Compute node is not available",
@@ -1552,14 +1580,27 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
15521580
if (systemStatus != Ydb::Monitoring::StatusFlag::GREEN && systemStatus != Ydb::Monitoring::StatusFlag::GREY) {
15531581
context.ReportStatus(systemStatus, "Compute has issues with system tablets", ETags::ComputeState, {ETags::SystemTabletState});
15541582
}
1583+
long maxClockSkewUs = 0;
1584+
TNodeId maxClockSkewNodeId = 0;
1585+
for (TNodeId nodeId : *computeNodeIds) {
1586+
auto itNodeSystemState = MergedNodeSystemState.find(nodeId);
1587+
if (itNodeSystemState != MergedNodeSystemState.end()) {
1588+
if (std::count(computeNodeIds->begin(), computeNodeIds->end(), itNodeSystemState->second->GetMaxClockSkewPeerId()) > 0
1589+
&& abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs()) > maxClockSkewUs) {
1590+
maxClockSkewUs = abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs());
1591+
maxClockSkewNodeId = nodeId;
1592+
}
1593+
}
1594+
}
15551595
for (TNodeId nodeId : *computeNodeIds) {
15561596
auto& computeNode = *computeStatus.add_nodes();
1557-
FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"});
1597+
FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"}, maxClockSkewNodeId == nodeId);
15581598
}
15591599
FillComputeDatabaseStatus(databaseState, computeStatus, {&context, "COMPUTE_QUOTA"});
15601600
context.ReportWithMaxChildStatus("Some nodes are restarting too often", ETags::ComputeState, {ETags::Uptime});
15611601
context.ReportWithMaxChildStatus("Compute is overloaded", ETags::ComputeState, {ETags::OverloadState});
15621602
context.ReportWithMaxChildStatus("Compute quota usage", ETags::ComputeState, {ETags::QuotaUsage});
1603+
context.ReportWithMaxChildStatus("Database has time difference between nodes", ETags::ComputeState, {ETags::SyncState});
15631604
Ydb::Monitoring::StatusFlag::Status tabletsStatus = Ydb::Monitoring::StatusFlag::GREEN;
15641605
computeNodeIds->push_back(0); // for tablets without node
15651606
for (TNodeId nodeId : *computeNodeIds) {
@@ -2579,17 +2620,14 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
25792620
databaseStatus.set_name(path);
25802621
FillCompute(state, *databaseStatus.mutable_compute(), {&dbContext, "COMPUTE"});
25812622
FillStorage(state, *databaseStatus.mutable_storage(), {&dbContext, "STORAGE"});
2582-
FillTimeDifference(state, *databaseStatus.mutable_time_difference(), {&dbContext, "NODES_TIME_DIFFERENCE"});
25832623
if (databaseStatus.compute().overall() != Ydb::Monitoring::StatusFlag::GREEN
25842624
&& databaseStatus.storage().overall() != Ydb::Monitoring::StatusFlag::GREEN) {
25852625
dbContext.ReportStatus(MaxStatus(databaseStatus.compute().overall(), databaseStatus.storage().overall()),
2586-
"Database has multiple issues", ETags::DBState, { ETags::ComputeState, ETags::StorageState, ETags::SyncState });
2626+
"Database has multiple issues", ETags::DBState, { ETags::ComputeState, ETags::StorageState });
25872627
} else if (databaseStatus.compute().overall() != Ydb::Monitoring::StatusFlag::GREEN) {
2588-
dbContext.ReportStatus(databaseStatus.compute().overall(), "Database has compute issues", ETags::DBState, {ETags::ComputeState, ETags::SyncState});
2628+
dbContext.ReportStatus(databaseStatus.compute().overall(), "Database has compute issues", ETags::DBState, {ETags::ComputeState});
25892629
} else if (databaseStatus.storage().overall() != Ydb::Monitoring::StatusFlag::GREEN) {
2590-
dbContext.ReportStatus(databaseStatus.storage().overall(), "Database has storage issues", ETags::DBState, {ETags::StorageState, ETags::SyncState});
2591-
} else if (databaseStatus.time_difference().overall() != Ydb::Monitoring::StatusFlag::GREEN) {
2592-
dbContext.ReportStatus(databaseStatus.time_difference().overall(), "Database has time difference issues", ETags::DBState, {ETags::SyncState});
2630+
dbContext.ReportStatus(databaseStatus.storage().overall(), "Database has storage issues", ETags::DBState, {ETags::StorageState});
25932631
}
25942632
databaseStatus.set_overall(dbContext.GetOverallStatus());
25952633
context.UpdateMaxStatus(dbContext.GetOverallStatus());
@@ -2602,58 +2640,6 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
26022640
const TDuration MAX_CLOCKSKEW_ORANGE_ISSUE_TIME = TDuration::MicroSeconds(25000);
26032641
const TDuration MAX_CLOCKSKEW_YELLOW_ISSUE_TIME = TDuration::MicroSeconds(5000);
26042642

2605-
void FillTimeDifference(TDatabaseState& databaseState, Ydb::Monitoring::TimeDifferenceStatus& timeDifferenceStatus, TSelfCheckContext context) {
2606-
long maxClockSkewUs = 0;
2607-
TNodeId maxClockSkewPeerId = 0;
2608-
TNodeId maxClockSkewNodeId = 0;
2609-
2610-
TVector<TNodeId>* computeNodeIds = &databaseState.ComputeNodeIds;
2611-
if (databaseState.ResourcePathId
2612-
&& databaseState.ServerlessComputeResourcesMode != NKikimrSubDomains::EServerlessComputeResourcesModeExclusive)
2613-
{
2614-
auto itDatabase = FilterDomainKey.find(TSubDomainKey(databaseState.ResourcePathId.OwnerId, databaseState.ResourcePathId.LocalPathId));
2615-
if (itDatabase != FilterDomainKey.end()) {
2616-
const TString& sharedDatabaseName = itDatabase->second;
2617-
TDatabaseState& sharedDatabase = DatabaseState[sharedDatabaseName];
2618-
computeNodeIds = &sharedDatabase.ComputeNodeIds;
2619-
}
2620-
}
2621-
2622-
for (TNodeId nodeId : *computeNodeIds) {
2623-
auto itNodeSystemState = MergedNodeSystemState.find(nodeId);
2624-
if (itNodeSystemState != MergedNodeSystemState.end()) {
2625-
if (IsTimeDifferenceCheckNode(nodeId) && IsTimeDifferenceCheckNode(itNodeSystemState->second->GetMaxClockSkewPeerId())
2626-
&& abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs()) > maxClockSkewUs) {
2627-
maxClockSkewUs = abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs());
2628-
maxClockSkewPeerId = itNodeSystemState->second->GetMaxClockSkewPeerId();
2629-
maxClockSkewNodeId = nodeId;
2630-
}
2631-
}
2632-
}
2633-
2634-
if (!maxClockSkewNodeId) {
2635-
timeDifferenceStatus.set_overall(Ydb::Monitoring::StatusFlag::GREEN);
2636-
return;
2637-
}
2638-
2639-
FillNodeInfo(maxClockSkewNodeId, context.Location.mutable_node());
2640-
FillNodeInfo(maxClockSkewPeerId, context.Location.mutable_peer());
2641-
2642-
TDuration maxClockSkewTime = TDuration::MicroSeconds(maxClockSkewUs);
2643-
if (maxClockSkewTime > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) {
2644-
context.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, TStringBuilder() << "The nodes have a time difference of " << maxClockSkewTime.MilliSeconds() << " ms", ETags::SyncState);
2645-
} else if (maxClockSkewTime > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) {
2646-
context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, TStringBuilder() << "The nodes have a time difference of " << maxClockSkewTime.MilliSeconds() << " ms", ETags::SyncState);
2647-
} else {
2648-
context.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
2649-
}
2650-
2651-
timeDifferenceStatus.set_node(ToString(maxClockSkewNodeId));
2652-
timeDifferenceStatus.set_peer(ToString(maxClockSkewPeerId));
2653-
timeDifferenceStatus.set_max_difference_ms(maxClockSkewTime.MilliSeconds());
2654-
timeDifferenceStatus.set_overall(context.GetOverallStatus());
2655-
}
2656-
26572643
void FillResult(TOverallStateContext context) {
26582644
if (IsSpecificDatabaseFilter()) {
26592645
FillDatabaseResult(context, FilterDatabase, DatabaseState[FilterDatabase]);

ydb/public/api/protos/ydb_monitoring.proto

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -106,12 +106,19 @@ message LoadAverageStatus {
106106
uint32 cores = 3;
107107
}
108108

109+
// Time difference (clock skew) between one compute node and a peer node.
// Embedded in ComputeNodeStatus as the max_time_difference field.
// NOTE(review): compared with the definition removed further down in this
// diff, field 2 is renamed (max_difference_ms -> difference_ms), `node` is
// dropped and `peer` moves from tag 4 to tag 3 — confirm that no consumer
// relies on the previous field names or tags.
message TimeDifferenceStatus {
110+
// Status flag for the time difference.
// NOTE(review): the visible FillComputeNodeStatus change sets `peer` and
// `difference_ms` on this message but calls set_overall() on the enclosing
// ComputeNodeStatus — confirm whether this field is populated anywhere.
StatusFlag.Status overall = 1;
111+
// Absolute time difference in milliseconds. The health check derives it from
// the node's reported max clock skew (microseconds), takes abs() and converts
// with TDuration::MilliSeconds().
int64 difference_ms = 2;
112+
// Id of the peer node the difference was measured against, written as a
// string (ToString(peerId) in the health check).
string peer = 3;
113+
}
114+
109115
// Status of a single compute node as filled by the health check's
// FillComputeNodeStatus.
message ComputeNodeStatus {
110116
string id = 1;
111117
StatusFlag.Status overall = 2;
112118
repeated ComputeTabletStatus tablets = 3;
113119
repeated ThreadPoolStatus pools = 4;
114120
LoadAverageStatus load = 5;
121+
// Clock skew this node reports against a peer. FillComputeNodeStatus fills
// `peer` from GetMaxClockSkewPeerId() and `difference_ms` from
// GetMaxClockSkewWithPeerUs().
TimeDifferenceStatus max_time_difference = 6;
115122
}
116123

117124
message ComputeStatus {
@@ -122,13 +129,6 @@ message ComputeStatus {
122129
float shards_quota_usage = 5;
123130
}
124131

125-
message TimeDifferenceStatus {
126-
StatusFlag.Status overall = 1;
127-
int64 max_difference_ms = 2;
128-
string node = 3;
129-
string peer = 4;
130-
}
131-
132132
message LocationNode {
133133
uint32 id = 1;
134134
string host = 2;
@@ -174,6 +174,7 @@ message LocationCompute {
174174
LocationNode node = 1;
175175
LocationComputePool pool = 2;
176176
LocationComputeTablet tablet = 3;
177+
LocationNode peer = 4;
177178
}
178179

179180
message LocationDatabase {
@@ -205,7 +206,6 @@ message DatabaseStatus {
205206
StatusFlag.Status overall = 2;
206207
StorageStatus storage = 3;
207208
ComputeStatus compute = 4;
208-
TimeDifferenceStatus time_difference = 5;
209209
}
210210

211211
message SelfCheckResult {

0 commit comments

Comments
 (0)