Skip to content

Commit 4a8dbd3

Browse files
Merge 34f2be1 into 736a48a
2 parents 736a48a + 34f2be1 commit 4a8dbd3

File tree

4 files changed

+57
-42
lines changed

4 files changed

+57
-42
lines changed

ydb/core/driver_lib/run/kikimr_services_initializers.cpp

+4-2
Original file line numberDiff line numberDiff line change
@@ -712,8 +712,10 @@ void TBasicServicesInitializer::InitializeServices(NActors::TActorSystemSetup* s
712712
data.Yellow ? NKikimrWhiteboard::EFlag::Yellow :
713713
data.Orange ? NKikimrWhiteboard::EFlag::Orange :
714714
data.Red ? NKikimrWhiteboard::EFlag::Red : NKikimrWhiteboard::EFlag()));
715-
data.ActorSystem->Send(whiteboardId, new NNodeWhiteboard::TEvWhiteboard::TEvClockSkewUpdate(
716-
data.PeerId, data.ClockSkew));
715+
if (data.ReportClockSkew) {
716+
data.ActorSystem->Send(whiteboardId, new NNodeWhiteboard::TEvWhiteboard::TEvClockSkewUpdate(
717+
data.PeerId, data.ClockSkew));
718+
}
717719
};
718720
}
719721

ydb/core/health_check/health_check.cpp

+43-38
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
235235
ui64 StorageQuota;
236236
ui64 StorageUsage;
237237
TMaybeServerlessComputeResourcesMode ServerlessComputeResourcesMode;
238+
TNodeId MaxTimeDifferenceNodeId = 0;
238239
};
239240

240241
struct TSelfCheckResult {
@@ -813,10 +814,10 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
813814
ReplyAndPassAway();
814815
}
815816

816-
bool IsStaticNode(const TEvInterconnect::TNodeInfo& nodeInfo) const {
817+
bool IsStaticNode(const TNodeId nodeId) const {
817818
TAppData* appData = AppData();
818819
if (appData->DynamicNameserviceConfig) {
819-
return nodeInfo.NodeId <= AppData()->DynamicNameserviceConfig->MaxStaticNodeId;
820+
return nodeId <= AppData()->DynamicNameserviceConfig->MaxStaticNodeId;
820821
} else {
821822
return true;
822823
}
@@ -827,7 +828,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
827828
NodesInfo = ev->Release();
828829
for (const auto& ni : NodesInfo->Nodes) {
829830
MergedNodeInfo[ni.NodeId] = &ni;
830-
if (IsStaticNode(ni) && needComputeFromStaticNodes) {
831+
if (IsStaticNode(ni.NodeId) && needComputeFromStaticNodes) {
831832
DatabaseState[DomainPath].ComputeNodeIds.push_back(ni.NodeId);
832833
RequestComputeNode(ni.NodeId);
833834
}
@@ -1251,7 +1252,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
12511252
}
12521253
}
12531254

1254-
void FillComputeNodeStatus(TDatabaseState& databaseState,TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) {
1255+
void FillComputeNodeStatus(TDatabaseState& databaseState, TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) {
12551256
FillNodeInfo(nodeId, context.Location.mutable_compute()->mutable_node());
12561257

12571258
TSelfCheckContext rrContext(&context, "NODE_UPTIME");
@@ -1289,6 +1290,32 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
12891290
}
12901291
loadAverageStatus.set_overall(laContext.GetOverallStatus());
12911292
}
1293+
1294+
if (nodeSystemState.HasMaxClockSkewPeerId()) {
1295+
TNodeId peerId = nodeSystemState.GetMaxClockSkewPeerId();
1296+
long timeDifferenceUs = nodeSystemState.GetMaxClockSkewWithPeerUs();
1297+
TDuration timeDifferenceDuration = TDuration::MicroSeconds(abs(timeDifferenceUs));
1298+
Ydb::Monitoring::StatusFlag::Status status;
1299+
if (timeDifferenceDuration > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) {
1300+
status = Ydb::Monitoring::StatusFlag::ORANGE;
1301+
} else if (timeDifferenceDuration > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) {
1302+
status = Ydb::Monitoring::StatusFlag::YELLOW;
1303+
} else {
1304+
status = Ydb::Monitoring::StatusFlag::GREEN;
1305+
}
1306+
1307+
if (databaseState.MaxTimeDifferenceNodeId == nodeId) {
1308+
TSelfCheckContext tdContext(&context, "NODES_TIME_DIFFERENCE");
1309+
if (status == Ydb::Monitoring::StatusFlag::GREEN) {
1310+
tdContext.ReportStatus(status);
1311+
} else {
1312+
tdContext.ReportStatus(status, TStringBuilder() << "Node is "
1313+
<< timeDifferenceDuration.MilliSeconds() << " ms "
1314+
<< (timeDifferenceUs > 0 ? "behind " : "ahead of ")
1315+
<< "peer [" << peerId << "]", ETags::SyncState);
1316+
}
1317+
}
1318+
}
12921319
} else {
12931320
// context.ReportStatus(Ydb::Monitoring::StatusFlag::RED,
12941321
// TStringBuilder() << "Compute node is not available",
@@ -1320,12 +1347,24 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
13201347
if (systemStatus != Ydb::Monitoring::StatusFlag::GREEN && systemStatus != Ydb::Monitoring::StatusFlag::GREY) {
13211348
context.ReportStatus(systemStatus, "Compute has issues with system tablets", ETags::ComputeState, {ETags::SystemTabletState});
13221349
}
1350+
long maxTimeDifferenceUs = 0;
1351+
for (TNodeId nodeId : *computeNodeIds) {
1352+
auto itNodeSystemState = MergedNodeSystemState.find(nodeId);
1353+
if (itNodeSystemState != MergedNodeSystemState.end()) {
1354+
if (std::count(computeNodeIds->begin(), computeNodeIds->end(), itNodeSystemState->second->GetMaxClockSkewPeerId()) > 0
1355+
&& abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs()) > maxTimeDifferenceUs) {
1356+
maxTimeDifferenceUs = abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs());
1357+
databaseState.MaxTimeDifferenceNodeId = nodeId;
1358+
}
1359+
}
1360+
}
13231361
for (TNodeId nodeId : *computeNodeIds) {
13241362
auto& computeNode = *computeStatus.add_nodes();
13251363
FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"});
13261364
}
13271365
context.ReportWithMaxChildStatus("Some nodes are restarting too often", ETags::ComputeState, {ETags::Uptime});
13281366
context.ReportWithMaxChildStatus("Compute is overloaded", ETags::ComputeState, {ETags::OverloadState});
1367+
context.ReportWithMaxChildStatus("Database has time difference between nodes", ETags::ComputeState, {ETags::SyncState});
13291368
Ydb::Monitoring::StatusFlag::Status tabletsStatus = Ydb::Monitoring::StatusFlag::GREEN;
13301369
computeNodeIds->push_back(0); // for tablets without node
13311370
for (TNodeId nodeId : *computeNodeIds) {
@@ -2072,39 +2111,6 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
20722111
const TDuration MAX_CLOCKSKEW_ORANGE_ISSUE_TIME = TDuration::MicroSeconds(25000);
20732112
const TDuration MAX_CLOCKSKEW_YELLOW_ISSUE_TIME = TDuration::MicroSeconds(5000);
20742113

2075-
void FillNodesSyncStatus(TOverallStateContext& context) {
2076-
long maxClockSkewUs = 0;
2077-
TNodeId maxClockSkewPeerId = 0;
2078-
TNodeId maxClockSkewNodeId = 0;
2079-
for (auto& [nodeId, nodeSystemState] : MergedNodeSystemState) {
2080-
if (abs(nodeSystemState->GetMaxClockSkewWithPeerUs()) > maxClockSkewUs) {
2081-
maxClockSkewUs = abs(nodeSystemState->GetMaxClockSkewWithPeerUs());
2082-
maxClockSkewPeerId = nodeSystemState->GetMaxClockSkewPeerId();
2083-
maxClockSkewNodeId = nodeId;
2084-
}
2085-
}
2086-
if (!maxClockSkewNodeId) {
2087-
return;
2088-
}
2089-
2090-
TSelfCheckResult syncContext;
2091-
syncContext.Type = "NODES_TIME_DIFFERENCE";
2092-
FillNodeInfo(maxClockSkewNodeId, syncContext.Location.mutable_node());
2093-
FillNodeInfo(maxClockSkewPeerId, syncContext.Location.mutable_peer());
2094-
2095-
TDuration maxClockSkewTime = TDuration::MicroSeconds(maxClockSkewUs);
2096-
if (maxClockSkewTime > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) {
2097-
syncContext.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, TStringBuilder() << "The nodes have a time difference of " << maxClockSkewTime.MilliSeconds() << " ms", ETags::SyncState);
2098-
} else if (maxClockSkewTime > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) {
2099-
syncContext.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, TStringBuilder() << "The nodes have a time difference of " << maxClockSkewTime.MilliSeconds() << " ms", ETags::SyncState);
2100-
} else {
2101-
syncContext.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
2102-
}
2103-
2104-
context.UpdateMaxStatus(syncContext.GetOverallStatus());
2105-
context.AddIssues(syncContext.IssueRecords);
2106-
}
2107-
21082114
void FillResult(TOverallStateContext context) {
21092115
if (IsSpecificDatabaseFilter()) {
21102116
FillDatabaseResult(context, FilterDatabase, DatabaseState[FilterDatabase]);
@@ -2113,7 +2119,6 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
21132119
FillDatabaseResult(context, path, state);
21142120
}
21152121
}
2116-
FillNodesSyncStatus(context);
21172122
if (DatabaseState.empty()) {
21182123
Ydb::Monitoring::DatabaseStatus& databaseStatus(*context.Result->add_database_status());
21192124
TSelfCheckResult tabletContext;

ydb/library/actors/interconnect/interconnect_common.h

+4-1
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,10 @@ namespace NActors {
7474
bool Orange;
7575
bool Red;
7676
i64 ClockSkew;
77+
bool ReportClockSkew;
7778

78-
TWhiteboardSessionStatus(TActorSystem* actorSystem, ui32 peerId, const TString& peer, bool connected, bool green, bool yellow, bool orange, bool red, i64 clockSkew)
79+
TWhiteboardSessionStatus(TActorSystem* actorSystem, ui32 peerId, const TString& peer, bool connected,
80+
bool green, bool yellow, bool orange, bool red, i64 clockSkew, bool reportClockSkew)
7981
: ActorSystem(actorSystem)
8082
, PeerId(peerId)
8183
, Peer(peer)
@@ -85,6 +87,7 @@ namespace NActors {
8587
, Orange(orange)
8688
, Red(red)
8789
, ClockSkew(clockSkew)
90+
, ReportClockSkew(reportClockSkew)
8891
{}
8992
};
9093

ydb/library/actors/interconnect/interconnect_tcp_session.cpp

+6-1
Original file line numberDiff line numberDiff line change
@@ -999,6 +999,10 @@ namespace NActors {
999999
} while (false);
10001000
}
10011001

1002+
// we need to track clock skew only when the connection is between nodes of the same tenant
1003+
// in that case both nodes share the same scope
1004+
bool reportClockSkew = Proxy->Common->LocalScopeId.first != 0 && Proxy->Common->LocalScopeId == Params.PeerScopeId;
1005+
10021006
callback({TlsActivationContext->ExecutorThread.ActorSystem,
10031007
Proxy->PeerNodeId,
10041008
Proxy->Metrics->GetHumanFriendlyPeerHostName(),
@@ -1007,7 +1011,8 @@ namespace NActors {
10071011
flagState == EFlag::YELLOW,
10081012
flagState == EFlag::ORANGE,
10091013
flagState == EFlag::RED,
1010-
ReceiveContext->ClockSkew_us.load()});
1014+
ReceiveContext->ClockSkew_us.load(),
1015+
reportClockSkew});
10111016
}
10121017

10131018
if (connected) {

0 commit comments

Comments
 (0)