Skip to content

Commit 7def863

Browse files
Merge 01ba776 into 86255fd
2 parents 86255fd + 01ba776 commit 7def863

File tree

2 files changed

+150
-34
lines changed

2 files changed

+150
-34
lines changed

ydb/core/health_check/health_check.cpp

Lines changed: 90 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -783,6 +783,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
783783
}
784784

785785
void Bootstrap() {
786+
Cerr << "iiiiii Bootstrap " << SelfId() << Endl;
786787
FilterDatabase = Request->Database;
787788
if (Request->Request.operation_params().has_operation_timeout()) {
788789
Timeout = GetDuration(Request->Request.operation_params().operation_timeout());
@@ -837,6 +838,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
837838
}
838839

839840
void Handle(TEvNodeWardenStorageConfig::TPtr ev) {
841+
Cerr << "aaaaa TEvNodeWardenStorageConfig" << Endl;
840842
NodeWardenStorageConfig->Set(std::move(ev));
841843
if (const NKikimrBlobStorage::TStorageConfig& config = *NodeWardenStorageConfig->Get()->Config; config.HasBlobStorageConfig()) {
842844
if (const auto& bsConfig = config.GetBlobStorageConfig(); bsConfig.HasServiceSet()) {
@@ -868,6 +870,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
868870
}
869871

870872
auto groupId = vDisk.GetVDiskID().GetGroupID();
873+
Cerr << "aaaaa TEvNodeWardenStorageConfig 2" << Endl;
871874
if (NeedWhiteboardInfoForGroup(groupId)) {
872875
BLOG_D("Requesting whiteboard for group " << groupId);
873876
RequestStorageNode(vDisk.GetVDiskLocation().GetNodeID());
@@ -1115,6 +1118,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
11151118
}
11161119

11171120
void RequestStorageNode(TNodeId nodeId) {
1121+
Cerr << "aaaaaaa RequestStorageNode " << nodeId << Endl;
11181122
if (StorageNodeIds.emplace(nodeId).second) {
11191123
RequestGenericNode(nodeId);
11201124
if (NodeVDiskState.count(nodeId) == 0) {
@@ -1155,24 +1159,34 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
11551159
}
11561160

11571161
void Handle(TEvPrivate::TEvRetryNodeWhiteboard::TPtr& ev) {
1162+
Cerr << "!!!!!!! Handle RetryNodeWhiteboard " << Endl;
11581163
auto eventId = ev->Get()->EventId;
11591164
auto nodeId = ev->Get()->NodeId;
11601165
switch (eventId) {
11611166
case TEvWhiteboard::EvSystemStateRequest:
1162-
NodeSystemState.erase(nodeId);
1163-
NodeSystemState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvSystemStateRequest>(nodeId);
1167+
Cerr << "!!!!!!! Handle RetryNodeWhiteboard EvSystemStateRequest " << Endl;
1168+
// if (!NodeSystemState[nodeId].IsDone()) {
1169+
NodeSystemState.erase(nodeId);
1170+
NodeSystemState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvSystemStateRequest>(nodeId);
1171+
// }
11641172
break;
11651173
case TEvWhiteboard::EvVDiskStateRequest:
1166-
NodeVDiskState.erase(nodeId);
1167-
NodeVDiskState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvVDiskStateRequest>(nodeId);
1174+
if (!NodeVDiskState[nodeId].IsDone()) {
1175+
NodeVDiskState.erase(nodeId);
1176+
NodeVDiskState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvVDiskStateRequest>(nodeId);
1177+
}
11681178
break;
11691179
case TEvWhiteboard::EvPDiskStateRequest:
1170-
NodePDiskState.erase(nodeId);
1171-
NodePDiskState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvPDiskStateRequest>(nodeId);
1180+
if (!NodePDiskState[nodeId].IsDone()) {
1181+
NodePDiskState.erase(nodeId);
1182+
NodePDiskState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvPDiskStateRequest>(nodeId);
1183+
}
11721184
break;
11731185
case TEvWhiteboard::EvBSGroupStateRequest:
1174-
NodeBSGroupState.erase(nodeId);
1175-
NodeBSGroupState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvBSGroupStateRequest>(nodeId);
1186+
if (!NodeBSGroupState[nodeId].IsDone()) {
1187+
NodeBSGroupState.erase(nodeId);
1188+
NodeBSGroupState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvBSGroupStateRequest>(nodeId);
1189+
}
11761190
break;
11771191
default:
11781192
RequestDone("unsupported event scheduled");
@@ -1182,6 +1196,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
11821196

11831197
template<typename TEvent>
11841198
bool RetryRequestNodeWhiteboard(TNodeId nodeId) {
1199+
Cerr << "!!!!!!! RetryRequestNodeWhiteboard " << nodeId << Endl;
11851200
if (NodeRetries[{nodeId, TEvent::EventType}]++ < MaxRetries) {
11861201
Schedule(RetryDelay, new TEvPrivate::TEvRetryNodeWhiteboard(nodeId, TEvent::EventType));
11871202
return true;
@@ -1190,6 +1205,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
11901205
}
11911206

11921207
void Handle(TEvents::TEvUndelivered::TPtr& ev) {
1208+
Cerr << "iiiiiiiii Undelivered " << Endl;
11931209
ui32 nodeId = ev.Get()->Cookie;
11941210
TString error = "Undelivered";
11951211
if (ev->Get()->SourceType == TEvWhiteboard::EvSystemStateRequest) {
@@ -1226,6 +1242,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
12261242
}
12271243

12281244
void Disconnected(TEvInterconnect::TEvNodeDisconnected::TPtr& ev) {
1245+
Cerr << "iiiiiiiii Disconnected " << Endl;
12291246
ui32 nodeId = ev->Get()->NodeId;
12301247
TString error = "NodeDisconnected";
12311248
if (NodeSystemState.count(nodeId) && NodeSystemState[nodeId].Error(error)) {
@@ -1310,8 +1327,10 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
13101327
}
13111328

13121329
void HandleTimeout(TEvents::TEvWakeup::TPtr& ev) {
1330+
Cerr << "aaaaa HandleTimeout" << Endl;
13131331
switch (ev->Get()->Tag) {
13141332
case TimeoutBSC:
1333+
Cerr << "aaaaa TimeoutBSC" << Endl;
13151334
Span.Event("TimeoutBSC");
13161335
if (!HaveAllBSControllerInfo()) {
13171336
if (FilterDatabase.empty() || FilterDatabase == DomainPath) {
@@ -1632,14 +1651,32 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
16321651
RequestDone("TEvListTenantsResponse");
16331652
}
16341653

1654+
// void Handle(TEvWhiteboard::TEvSystemStateResponse::TPtr& ev) {
1655+
// TNodeId nodeId = ev.Get()->Cookie;
1656+
// Cerr << "iiiiiiii TEvSystemStateResponse: nodeId: " << nodeId << Endl;
1657+
// if (!NodeSystemState[nodeId].Done()) {
1658+
// auto& nodeSystemState(NodeSystemState[nodeId]);
1659+
// nodeSystemState.Set(std::move(ev));
1660+
// for (NKikimrWhiteboard::TSystemStateInfo& state : *nodeSystemState->Record.MutableSystemStateInfo()) {
1661+
// state.set_nodeid(nodeId);
1662+
// MergedNodeSystemState[nodeId] = &state;
1663+
// }
1664+
// }
1665+
// RequestDone("TEvSystemStateResponse");
1666+
// }
1667+
16351668
void Handle(TEvWhiteboard::TEvSystemStateResponse::TPtr& ev) {
16361669
TNodeId nodeId = ev.Get()->Cookie;
1670+
Cerr << "iiiiiiii TEvSystemStateResponse: nodeId: " << nodeId << Endl;
1671+
16371672
auto& nodeSystemState(NodeSystemState[nodeId]);
16381673
nodeSystemState.Set(std::move(ev));
16391674
for (NKikimrWhiteboard::TSystemStateInfo& state : *nodeSystemState->Record.MutableSystemStateInfo()) {
1675+
Cerr << "iiiiiiii Fill " << Endl;
16401676
state.set_nodeid(nodeId);
16411677
MergedNodeSystemState[nodeId] = &state;
16421678
}
1679+
16431680
RequestDone("TEvSystemStateResponse");
16441681
}
16451682

@@ -1891,10 +1928,12 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
18911928
rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
18921929
}
18931930

1931+
Cerr << "iiiiiiii FillComputeNodeStatus: nodeId: " << nodeId << Endl;
18941932
auto itNodeSystemState = MergedNodeSystemState.find(nodeId);
18951933
if (itNodeSystemState != MergedNodeSystemState.end()) {
18961934
const NKikimrWhiteboard::TSystemStateInfo& nodeSystemState(*itNodeSystemState->second);
18971935

1936+
Cerr << "iiiiiiii poolstats: " << nodeSystemState.poolstats_size() << Endl;
18981937
for (const auto& poolStat : nodeSystemState.poolstats()) {
18991938
TSelfCheckContext poolContext(&context, "COMPUTE_POOL");
19001939
poolContext.Location.mutable_compute()->mutable_pool()->set_name(poolStat.name());
@@ -1943,6 +1982,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
19431982
}
19441983
}
19451984
}
1985+
Cerr << "iiiiiiii nodeSystemState: 2 " << Endl;
19461986
} else {
19471987
// context.ReportStatus(Ydb::Monitoring::StatusFlag::RED,
19481988
// TStringBuilder() << "Compute node is not available",
@@ -2306,45 +2346,61 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
23062346

23072347
void Handle(TEvWhiteboard::TEvVDiskStateResponse::TPtr& ev) {
23082348
TNodeId nodeId = ev.Get()->Cookie;
2309-
auto& nodeVDiskState(NodeVDiskState[nodeId]);
2310-
nodeVDiskState.Set(std::move(ev));
2311-
for (NKikimrWhiteboard::TVDiskStateInfo& state : *nodeVDiskState->Record.MutableVDiskStateInfo()) {
2312-
state.set_nodeid(nodeId);
2313-
auto id = GetVDiskId(state.vdiskid());
2314-
MergedVDiskState[id] = &state;
2349+
if (!NodeVDiskState.count(nodeId)) {
2350+
auto& nodeVDiskState(NodeVDiskState[nodeId]);
2351+
nodeVDiskState.Set(std::move(ev));
2352+
for (NKikimrWhiteboard::TVDiskStateInfo& state : *nodeVDiskState->Record.MutableVDiskStateInfo()) {
2353+
state.set_nodeid(nodeId);
2354+
auto id = GetVDiskId(state.vdiskid());
2355+
MergedVDiskState[id] = &state;
2356+
}
2357+
}
2358+
2359+
TString error = "NodeDisconnected";
2360+
if (NodeSystemState.count(nodeId) && NodeSystemState[nodeId].Error(error)) {
2361+
if (!RetryRequestNodeWhiteboard<TEvWhiteboard::TEvSystemStateRequest>(nodeId)) {
2362+
Cerr << "iiiiiii Retry" << Endl;
2363+
RequestDone("node disconnected with TEvSystemStateRequest");
2364+
UnavailableComputeNodes.insert(nodeId);
2365+
}
23152366
}
2367+
23162368
RequestDone("TEvVDiskStateResponse");
23172369
}
23182370

23192371
void Handle(TEvWhiteboard::TEvPDiskStateResponse::TPtr& ev) {
23202372
TNodeId nodeId = ev.Get()->Cookie;
2321-
auto& nodePDiskState(NodePDiskState[nodeId]);
2322-
nodePDiskState.Set(std::move(ev));
2323-
for (NKikimrWhiteboard::TPDiskStateInfo& state : *nodePDiskState->Record.MutablePDiskStateInfo()) {
2324-
state.set_nodeid(nodeId);
2325-
auto id = GetPDiskId(state);
2326-
MergedPDiskState[id] = &state;
2373+
if (!NodePDiskState.count(nodeId)) {
2374+
auto& nodePDiskState(NodePDiskState[nodeId]);
2375+
nodePDiskState.Set(std::move(ev));
2376+
for (NKikimrWhiteboard::TPDiskStateInfo& state : *nodePDiskState->Record.MutablePDiskStateInfo()) {
2377+
state.set_nodeid(nodeId);
2378+
auto id = GetPDiskId(state);
2379+
MergedPDiskState[id] = &state;
2380+
}
23272381
}
23282382
RequestDone("TEvPDiskStateResponse");
23292383
}
23302384

23312385
void Handle(TEvWhiteboard::TEvBSGroupStateResponse::TPtr& ev) {
23322386
ui64 nodeId = ev.Get()->Cookie;
2333-
auto& nodeBSGroupState(NodeBSGroupState[nodeId]);
2334-
nodeBSGroupState.Set(std::move(ev));
2335-
for (NKikimrWhiteboard::TBSGroupStateInfo& state : *nodeBSGroupState->Record.MutableBSGroupStateInfo()) {
2336-
state.set_nodeid(nodeId);
2337-
TString storagePoolName = state.storagepoolname();
2338-
TGroupID groupId(state.groupid());
2339-
const NKikimrWhiteboard::TBSGroupStateInfo*& current(MergedBSGroupState[state.groupid()]);
2340-
if (current == nullptr || current->GetGroupGeneration() < state.GetGroupGeneration()) {
2341-
current = &state;
2342-
}
2343-
if (storagePoolName.empty() && groupId.ConfigurationType() != EGroupConfigurationType::Static) {
2344-
continue;
2387+
if (!NodeBSGroupState.count(nodeId)) {
2388+
auto& nodeBSGroupState(NodeBSGroupState[nodeId]);
2389+
nodeBSGroupState.Set(std::move(ev));
2390+
for (NKikimrWhiteboard::TBSGroupStateInfo& state : *nodeBSGroupState->Record.MutableBSGroupStateInfo()) {
2391+
state.set_nodeid(nodeId);
2392+
TString storagePoolName = state.storagepoolname();
2393+
TGroupID groupId(state.groupid());
2394+
const NKikimrWhiteboard::TBSGroupStateInfo*& current(MergedBSGroupState[state.groupid()]);
2395+
if (current == nullptr || current->GetGroupGeneration() < state.GetGroupGeneration()) {
2396+
current = &state;
2397+
}
2398+
if (storagePoolName.empty() && groupId.ConfigurationType() != EGroupConfigurationType::Static) {
2399+
continue;
2400+
}
2401+
StoragePoolStateByName[storagePoolName].Groups.emplace(state.groupid());
2402+
StoragePoolStateByName[storagePoolName].Name = storagePoolName;
23452403
}
2346-
StoragePoolStateByName[storagePoolName].Groups.emplace(state.groupid());
2347-
StoragePoolStateByName[storagePoolName].Name = storagePoolName;
23482404
}
23492405
RequestDone("TEvBSGroupStateResponse");
23502406
}

ydb/core/health_check/health_check_ut.cpp

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2353,5 +2353,65 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
23532353
}
23542354
UNIT_ASSERT(pdiskIssueFoundInResult);
23552355
}
2356+
2357+
Y_UNIT_TEST(TestWhiteboardResponseOnSameNode) {
2358+
TPortManager tp;
2359+
ui16 port = tp.GetPort(2134);
2360+
ui16 grpcPort = tp.GetPort(2135);
2361+
auto settings = TServerSettings(port)
2362+
.SetNodeCount(1)
2363+
.SetDynamicNodeCount(1)
2364+
.SetUseRealThreads(false)
2365+
.SetDomainName("Root");
2366+
TServer server(settings);
2367+
server.EnableGRpc(grpcPort);
2368+
TClient client(settings);
2369+
TTestActorRuntime& runtime = *server.GetRuntime();
2370+
2371+
TActorId sender = runtime.AllocateEdgeActor();
2372+
TAutoPtr<IEventHandle> handle;
2373+
2374+
std::optional<TNodeId> nodeId;
2375+
auto observerFunc = [&](TAutoPtr<IEventHandle>& ev) {
2376+
switch (ev->GetTypeRewrite()) {
2377+
case TEvWhiteboard::EvVDiskStateResponse: {
2378+
auto* msg = ev->Release<TEvWhiteboard::TEvVDiskStateResponse>().Release();
2379+
msg->Record.ClearVDiskStateInfo(); // whiteboard doesn't have any update
2380+
ev.Reset(new IEventHandle(ev->Recipient, ev->Sender, msg, ev->Flags, *nodeId));
2381+
break;
2382+
}
2383+
case TEvWhiteboard::EvSystemStateResponse: {
2384+
if (!nodeId) {
2385+
nodeId = ev->Cookie;
2386+
} else {
2387+
auto* msg = ev->Release<TEvWhiteboard::TEvSystemStateResponse>().Release();
2388+
msg->Record.ClearSystemStateInfo(); // whiteboard doesn't have any update
2389+
ev.Reset(new IEventHandle(ev->Recipient, ev->Sender, msg, ev->Flags, *nodeId));
2390+
}
2391+
break;
2392+
}
2393+
}
2394+
2395+
return TTestActorRuntime::EEventAction::PROCESS;
2396+
};
2397+
runtime.SetObserverFunc(observerFunc);
2398+
2399+
2400+
2401+
if (!delayed) {
2402+
TDispatchOptions opts;
2403+
opts.FinalEvents.emplace_back([&delayed](IEventHandle&) {
2404+
return bool(delayed);
2405+
});
2406+
server->GetRuntime()->DispatchEvents(opts);
2407+
}
2408+
2409+
runtime.Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0));
2410+
Cerr << "iiiiii try 1" << Endl;
2411+
runtime.GrabEdgeEvent<TEvWhiteboard::TEvSystemStateResponse>(handle);
2412+
Cerr << "iiiiii try 2" << Endl;
2413+
auto result = runtime.GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
2414+
Cerr << result.ShortDebugString();
2415+
}
23562416
}
23572417
}

0 commit comments

Comments
 (0)