Skip to content

Commit 24a0986

Browse files
authored
25-1-1: healthcheck segfault while retrying Whiteboard (#17836) (#18905)
2 parents 319908d + 18480f8 commit 24a0986

File tree

2 files changed

+62
-8
lines changed

2 files changed

+62
-8
lines changed

ydb/core/health_check/health_check.cpp

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1102,20 +1102,28 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
11021102
auto nodeId = ev->Get()->NodeId;
11031103
switch (eventId) {
11041104
case TEvWhiteboard::EvSystemStateRequest:
1105-
NodeSystemState.erase(nodeId);
1106-
NodeSystemState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvSystemStateRequest>(nodeId);
1105+
if (!NodeSystemState[nodeId].IsDone()) {
1106+
NodeSystemState.erase(nodeId);
1107+
NodeSystemState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvSystemStateRequest>(nodeId, {-1});
1108+
}
11071109
break;
11081110
case TEvWhiteboard::EvVDiskStateRequest:
1109-
NodeVDiskState.erase(nodeId);
1110-
NodeVDiskState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvVDiskStateRequest>(nodeId);
1111+
if (!NodeVDiskState[nodeId].IsDone()) {
1112+
NodeVDiskState.erase(nodeId);
1113+
NodeVDiskState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvVDiskStateRequest>(nodeId);
1114+
}
11111115
break;
11121116
case TEvWhiteboard::EvPDiskStateRequest:
1113-
NodePDiskState.erase(nodeId);
1114-
NodePDiskState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvPDiskStateRequest>(nodeId);
1117+
if (!NodePDiskState[nodeId].IsDone()) {
1118+
NodePDiskState.erase(nodeId);
1119+
NodePDiskState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvPDiskStateRequest>(nodeId);
1120+
}
11151121
break;
11161122
case TEvWhiteboard::EvBSGroupStateRequest:
1117-
NodeBSGroupState.erase(nodeId);
1118-
NodeBSGroupState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvBSGroupStateRequest>(nodeId);
1123+
if (!NodeBSGroupState[nodeId].IsDone()) {
1124+
NodeBSGroupState.erase(nodeId);
1125+
NodeBSGroupState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvBSGroupStateRequest>(nodeId);
1126+
}
11191127
break;
11201128
default:
11211129
RequestDone("unsupported event scheduled");

ydb/core/health_check/health_check_ut.cpp

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2066,5 +2066,51 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
20662066

20672067
UNIT_ASSERT(HasTabletIssue(result));
20682068
}
2069+
2070+
// Regression test for retrying a Whiteboard system-state request after its
// response has already been handed to the requester (the commit title refers
// to a segfault in that situation).
// Scenario: the observer intercepts the first TEvWhiteboard::EvSystemStateResponse
// carrying Cookie == 1, forwards it by hand to its recipient and immediately
// follows it with a TEvRetryNodeWhiteboard for the system-state request.
// The self-check is then expected to complete with the status GOOD.
Y_UNIT_TEST(TestSystemStateRetriesAfterReceivingResponse) {
2071+
TPortManager tp;
2072+
ui16 port = tp.GetPort(2134);
2073+
ui16 grpcPort = tp.GetPort(2135);
2074+
// One node plus one dynamic node, run without real threads.
auto settings = TServerSettings(port)
2075+
.SetNodeCount(1)
2076+
.SetDynamicNodeCount(1)
2077+
.SetUseRealThreads(false)
2078+
.SetDomainName("Root");
2079+
TServer server(settings);
2080+
server.EnableGRpc(grpcPort);
2081+
TClient client(settings);
2082+
TTestActorRuntime& runtime = *server.GetRuntime();
2083+
2084+
// Edge actor that issues the self-check request and receives its result.
TActorId sender = runtime.AllocateEdgeActor();
2085+
TAutoPtr<IEventHandle> handle;
2086+
2087+
// Recipient of the first intercepted system-state response; stays empty
// until that response is seen, so the injection below happens only once.
// NOTE(review): presumably this is the self-check request actor - confirm.
std::optional<TActorId> targetActor;
2088+
auto observerFunc = [&](TAutoPtr<IEventHandle>& ev) {
2089+
switch (ev->GetTypeRewrite()) {
2090+
case TEvWhiteboard::EvSystemStateResponse: {
2091+
if (ev->Cookie == 1) {
2092+
if (!targetActor) {
2093+
targetActor = ev->Recipient;
2094+
// Hand the original response over manually (ownership is released
// from ev), then send a retry event to the same recipient.
runtime.Send(ev.Release());
2095+
runtime.Send(new IEventHandle(
2096+
*targetActor,
2097+
sender,
2098+
// NOTE(review): the first argument (1) is presumably the node id - confirm
// against the TEvRetryNodeWhiteboard constructor.
new NHealthCheck::TEvPrivate::TEvRetryNodeWhiteboard(1, TEvWhiteboard::TEvSystemStateRequest::EventType)
2099+
));
2100+
2101+
}
2102+
// Every system-state response with Cookie == 1 is reported as dropped
// to the runtime; the first one was already re-sent by hand above.
return TTestActorRuntime::EEventAction::DROP;
2103+
}
2104+
break;
2105+
}
2106+
}
2107+
// All other events are processed normally.
return TTestActorRuntime::EEventAction::PROCESS;
2108+
};
2109+
runtime.SetObserverFunc(observerFunc);
2110+
runtime.Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0));
2111+
2112+
// Wait for the self-check result on the edge actor and verify the overall status.
auto result = runtime.GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
2113+
UNIT_ASSERT_VALUES_EQUAL(result.self_check_result(), Ydb::Monitoring::SelfCheck::GOOD);
2114+
}
20692115
}
20702116
}

0 commit comments

Comments
 (0)