Skip to content

Commit b3bc6c8

Browse files
healthcheck segfault while retrying Whiteboard (#17836)
1 parent 9289583 commit b3bc6c8

File tree

2 files changed

+62
-8
lines changed

2 files changed

+62
-8
lines changed

ydb/core/health_check/health_check.cpp

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1112,20 +1112,28 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
11121112
auto nodeId = ev->Get()->NodeId;
11131113
switch (eventId) {
11141114
case NNodeWhiteboard::TEvWhiteboard::EvSystemStateRequest:
1115-
NodeSystemState.erase(nodeId);
1116-
NodeSystemState[nodeId] = RequestNodeWhiteboard<NNodeWhiteboard::TEvWhiteboard::TEvSystemStateRequest>(nodeId);
1115+
if (!NodeSystemState[nodeId].IsDone()) {
1116+
NodeSystemState.erase(nodeId);
1117+
NodeSystemState[nodeId] = RequestNodeWhiteboard<NNodeWhiteboard::TEvWhiteboard::TEvSystemStateRequest>(nodeId, {-1});
1118+
}
11171119
break;
11181120
case NNodeWhiteboard::TEvWhiteboard::EvVDiskStateRequest:
1119-
NodeVDiskState.erase(nodeId);
1120-
NodeVDiskState[nodeId] = RequestNodeWhiteboard<NNodeWhiteboard::TEvWhiteboard::TEvVDiskStateRequest>(nodeId);
1121+
if (!NodeVDiskState[nodeId].IsDone()) {
1122+
NodeVDiskState.erase(nodeId);
1123+
NodeVDiskState[nodeId] = RequestNodeWhiteboard<NNodeWhiteboard::TEvWhiteboard::TEvVDiskStateRequest>(nodeId);
1124+
}
11211125
break;
11221126
case NNodeWhiteboard::TEvWhiteboard::EvPDiskStateRequest:
1123-
NodePDiskState.erase(nodeId);
1124-
NodePDiskState[nodeId] = RequestNodeWhiteboard<NNodeWhiteboard::TEvWhiteboard::TEvPDiskStateRequest>(nodeId);
1127+
if (!NodePDiskState[nodeId].IsDone()) {
1128+
NodePDiskState.erase(nodeId);
1129+
NodePDiskState[nodeId] = RequestNodeWhiteboard<NNodeWhiteboard::TEvWhiteboard::TEvPDiskStateRequest>(nodeId);
1130+
}
11251131
break;
11261132
case NNodeWhiteboard::TEvWhiteboard::EvBSGroupStateRequest:
1127-
NodeBSGroupState.erase(nodeId);
1128-
NodeBSGroupState[nodeId] = RequestNodeWhiteboard<NNodeWhiteboard::TEvWhiteboard::TEvBSGroupStateRequest>(nodeId);
1133+
if (!NodeBSGroupState[nodeId].IsDone()) {
1134+
NodeBSGroupState.erase(nodeId);
1135+
NodeBSGroupState[nodeId] = RequestNodeWhiteboard<NNodeWhiteboard::TEvWhiteboard::TEvBSGroupStateRequest>(nodeId);
1136+
}
11291137
break;
11301138
default:
11311139
RequestDone("unsupported event scheduled");

ydb/core/health_check/health_check_ut.cpp

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1961,5 +1961,51 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
19611961

19621962
UNIT_ASSERT(HasDeadTabletIssue(result));
19631963
}
1964+
1965+
Y_UNIT_TEST(TestSystemStateRetriesAfterReceivingResponse) {
// Scenario: a TEvRetryNodeWhiteboard for the system-state request is injected
// right after a system-state response has been re-sent to its recipient, and the
// self-check is still expected to finish with a GOOD result.
1966+
TPortManager tp;
1967+
ui16 port = tp.GetPort(2134);
1968+
ui16 grpcPort = tp.GetPort(2135);
// Server settings: one node, one dynamic node, real threads disabled, domain "Root".
1969+
auto settings = TServerSettings(port)
1970+
.SetNodeCount(1)
1971+
.SetDynamicNodeCount(1)
1972+
.SetUseRealThreads(false)
1973+
.SetDomainName("Root");
1974+
TServer server(settings);
1975+
server.EnableGRpc(grpcPort);
1976+
TClient client(settings);
1977+
TTestActorRuntime& runtime = *server.GetRuntime();
1978+
1979+
TActorId sender = runtime.AllocateEdgeActor();
1980+
TAutoPtr<IEventHandle> handle;
1981+
// Recipient of the first intercepted response. It stays empty until that
// response is seen, so the injection inside the observer happens only once.
1982+
std::optional<TActorId> targetActor;
1983+
auto observerFunc = [&](TAutoPtr<IEventHandle>& ev) {
1984+
switch (ev->GetTypeRewrite()) {
1985+
case TEvWhiteboard::EvSystemStateResponse: {
// Only system-state responses carrying cookie 1 are intercepted; any other
// event reaches the PROCESS return at the bottom of the observer.
// NOTE(review): cookie 1 presumably identifies node 1 - confirm against the sender.
1986+
if (ev->Cookie == 1) {
1987+
if (!targetActor) {
1988+
targetActor = ev->Recipient;
// Hand the intercepted handle back to the runtime, then follow it with a
// retry event addressed to the same recipient.
1989+
runtime.Send(ev.Release());
1990+
runtime.Send(new IEventHandle(
1991+
*targetActor,
1992+
sender,
// Arguments are (1, system-state request event type).
// NOTE(review): the 1 is presumably the node id - confirm against TEvRetryNodeWhiteboard.
1993+
new NHealthCheck::TEvPrivate::TEvRetryNodeWhiteboard(1, TEvWhiteboard::TEvSystemStateRequest::EventType)
1994+
));
1995+
1996+
}
// Every cookie-1 response ends in DROP: on the first hit the handle was already
// released to runtime.Send above, and later ones skip the branch and are discarded.
1997+
return TTestActorRuntime::EEventAction::DROP;
1998+
}
1999+
break;
2000+
}
2001+
}
2002+
return TTestActorRuntime::EEventAction::PROCESS;
2003+
};
2004+
runtime.SetObserverFunc(observerFunc);
// Send a self-check request from the edge actor and wait for the result event.
2005+
runtime.Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0));
2006+
2007+
auto result = runtime.GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
2008+
UNIT_ASSERT_VALUES_EQUAL(result.self_check_result(), Ydb::Monitoring::SelfCheck::GOOD);
2009+
}
19642010
}
19652011
}

0 commit comments

Comments
 (0)