Skip to content

Commit 507719c

Browse files
authored
specify whiteboard fields in healthcheck to get bridge info (#23365)
1 parent e15f7e7 commit 507719c

File tree

2 files changed

+129
-5
lines changed

2 files changed

+129
-5
lines changed

ydb/core/health_check/health_check.cpp

Lines changed: 54 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1113,10 +1113,59 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
11131113
}
11141114

11151115
template<typename TEvent>
1116-
[[nodiscard]] TRequestResponse<typename WhiteboardResponse<TEvent>::Type> RequestNodeWhiteboard(TNodeId nodeId, std::initializer_list<int> fields = {}) {
1116+
// Returns the whiteboard field numbers to request for the request event type TEvent.
// Only this declaration of the primary template is visible in the diff; the lists
// themselves come from the explicit specializations that follow it.
// NOTE(review): a TEvent without a specialization appears to have no definition — confirm.
std::vector<int> GetRequiredFields();
1117+
1118+
// Field numbers of NKikimrWhiteboard::TSystemStateInfo to request in a
// TEvSystemStateRequest: pool stats, load average, number of CPUs, location and
// the two max-clock-skew fields (peer id and skew in microseconds).
// RequestNodeWhiteboard passes each returned value to Record.AddFieldsRequired().
template<>
1119+
std::vector<int> GetRequiredFields<TEvWhiteboard::TEvSystemStateRequest>() {
1120+
return {
1121+
NKikimrWhiteboard::TSystemStateInfo::kPoolStatsFieldNumber,
1122+
NKikimrWhiteboard::TSystemStateInfo::kLoadAverageFieldNumber,
1123+
NKikimrWhiteboard::TSystemStateInfo::kNumberOfCpusFieldNumber,
1124+
NKikimrWhiteboard::TSystemStateInfo::kMaxClockSkewPeerIdFieldNumber,
1125+
NKikimrWhiteboard::TSystemStateInfo::kLocationFieldNumber,
1126+
NKikimrWhiteboard::TSystemStateInfo::kMaxClockSkewWithPeerUsFieldNumber,
1127+
};
1128+
}
1129+
1130+
// Field numbers of NKikimrWhiteboard::TVDiskStateInfo to request in a
// TEvVDiskStateRequest: VDisk id, PDisk id, VDisk state, replicated flag and
// disk space.
// RequestNodeWhiteboard passes each returned value to Record.AddFieldsRequired().
template<>
1131+
std::vector<int> GetRequiredFields<TEvWhiteboard::TEvVDiskStateRequest>() {
1132+
return {
1133+
NKikimrWhiteboard::TVDiskStateInfo::kVDiskIdFieldNumber,
1134+
NKikimrWhiteboard::TVDiskStateInfo::kPDiskIdFieldNumber,
1135+
NKikimrWhiteboard::TVDiskStateInfo::kVDiskStateFieldNumber,
1136+
NKikimrWhiteboard::TVDiskStateInfo::kReplicatedFieldNumber,
1137+
NKikimrWhiteboard::TVDiskStateInfo::kDiskSpaceFieldNumber,
1138+
};
1139+
}
1140+
1141+
// Field numbers of NKikimrWhiteboard::TPDiskStateInfo to request in a
// TEvPDiskStateRequest: PDisk id, path, available size, total size and state.
// RequestNodeWhiteboard passes each returned value to Record.AddFieldsRequired().
template<>
1142+
std::vector<int> GetRequiredFields<TEvWhiteboard::TEvPDiskStateRequest>() {
1143+
return {
1144+
NKikimrWhiteboard::TPDiskStateInfo::kPDiskIdFieldNumber,
1145+
NKikimrWhiteboard::TPDiskStateInfo::kPathFieldNumber,
1146+
NKikimrWhiteboard::TPDiskStateInfo::kAvailableSizeFieldNumber,
1147+
NKikimrWhiteboard::TPDiskStateInfo::kTotalSizeFieldNumber,
1148+
NKikimrWhiteboard::TPDiskStateInfo::kStateFieldNumber,
1149+
};
1150+
}
1151+
1152+
// Field numbers of NKikimrWhiteboard::TBSGroupStateInfo to request in a
// TEvBSGroupStateRequest: group generation, group id, storage pool name,
// bridge pile id, erasure species and the VDisk id list.
// RequestNodeWhiteboard passes each returned value to Record.AddFieldsRequired().
template<>
1153+
std::vector<int> GetRequiredFields<TEvWhiteboard::TEvBSGroupStateRequest>() {
1154+
return {
1155+
NKikimrWhiteboard::TBSGroupStateInfo::kGroupGenerationFieldNumber,
1156+
NKikimrWhiteboard::TBSGroupStateInfo::kGroupIDFieldNumber,
1157+
NKikimrWhiteboard::TBSGroupStateInfo::kStoragePoolNameFieldNumber,
1158+
NKikimrWhiteboard::TBSGroupStateInfo::kBridgePileIdFieldNumber,
1159+
NKikimrWhiteboard::TBSGroupStateInfo::kErasureSpeciesFieldNumber,
1160+
NKikimrWhiteboard::TBSGroupStateInfo::kVDiskIdsFieldNumber,
1161+
};
1162+
}
1163+
1164+
template<typename TEvent>
1165+
[[nodiscard]] TRequestResponse<typename WhiteboardResponse<TEvent>::Type> RequestNodeWhiteboard(TNodeId nodeId) {
11171166
TActorId whiteboardServiceId = MakeNodeWhiteboardServiceId(nodeId);
11181167
auto request = MakeHolder<TEvent>();
1119-
for (int field : fields) {
1168+
for (int field : GetRequiredFields<TEvent>()) {
11201169
request->Record.AddFieldsRequired(field);
11211170
}
11221171
TRequestResponse<typename WhiteboardResponse<TEvent>::Type> response(Span.CreateChild(TComponentTracingLevels::TTablet::Detailed, TypeName(*request.Get())));
@@ -1128,9 +1177,10 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
11281177
return response;
11291178
}
11301179

1180+
11311181
// Requests system state for nodeId unless NodeSystemState already has an entry
// for it: calls RequestNodeWhiteboard<TEvSystemStateRequest>, stores the returned
// request/response object under nodeId and increments the Requests counter.
void RequestGenericNode(TNodeId nodeId) {
11321182
// count() == 0 means no request has been recorded for this node yet.
if (NodeSystemState.count(nodeId) == 0) {
1133-
NodeSystemState.emplace(nodeId, RequestNodeWhiteboard<TEvWhiteboard::TEvSystemStateRequest>(nodeId, {-1}));
1183+
NodeSystemState.emplace(nodeId, RequestNodeWhiteboard<TEvWhiteboard::TEvSystemStateRequest>(nodeId));
11341184
++Requests;
11351185
}
11361186
}
@@ -1190,7 +1240,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
11901240
case TEvWhiteboard::EvSystemStateRequest: {
11911241
auto& request = NodeSystemState[nodeId];
11921242
if (!request.IsOk()) {
1193-
request = RequestNodeWhiteboard<TEvWhiteboard::TEvSystemStateRequest>(nodeId, {-1});
1243+
request = RequestNodeWhiteboard<TEvWhiteboard::TEvSystemStateRequest>(nodeId);
11941244
}
11951245
break;
11961246
}

ydb/core/health_check/health_check_ut.cpp

Lines changed: 75 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -690,7 +690,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
690690
}
691691
};
692692

693-
void CheckHcResultHasIssuesWithStatus(Ydb::Monitoring::SelfCheckResult& result, const TString& type,
693+
void CheckHcResultHasIssuesWithStatus(const Ydb::Monitoring::SelfCheckResult& result, const TString& type,
694694
const Ydb::Monitoring::StatusFlag::Status expectingStatus, ui32 total,
695695
TLocationFilter locationFilter = {}) {
696696
int issuesCount = 0;
@@ -2192,6 +2192,80 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
21922192
UNIT_ASSERT_VALUES_EQUAL(database_status.storage().pools()[0].id(), "static");
21932193
}
21942194

2195+
// Health-check test on a one-static-node / one-dynamic-node server with a
// two-pile bridge configuration injected through an event observer.
// The observer drops every event in the ES_SYSTEM_VIEW event space, overrides
// the bridge pile id of every reported storage group and forces the first
// reported VDisk into SyncGuidRecovery.
// Expected outcome: overall EMERGENCY, a level-3 SYSTEM_TABLET issue located at
// the BSController tablet, and exactly one RED STORAGE_GROUP issue matching
// pool "static" / pile "1".
// NOTE(review): the test name suggests the dropped events are what leaves the
// health check without a BSC response — confirm.
Y_UNIT_TEST(BridgeNoBscResponse) {
2196+
TPortManager tp;
2197+
ui16 port = tp.GetPort(2134);
2198+
ui16 grpcPort = tp.GetPort(2135);
2199+
auto settings = TServerSettings(port)
2200+
.SetNodeCount(1)
2201+
.SetDynamicNodeCount(1)
2202+
.SetUseRealThreads(false)
2203+
.SetDomainName("Root");
2204+
TServer server(settings);
2205+
server.EnableGRpc(grpcPort);
2206+
TClient client(settings);
2207+
TTestActorRuntime& runtime = *server.GetRuntime();
2208+
2209+
// Derive the static/dynamic node id boundaries from the ids the runtime assigned.
auto &dynamicNameserviceConfig = runtime.GetAppData().DynamicNameserviceConfig;
2210+
dynamicNameserviceConfig->MaxStaticNodeId = runtime.GetNodeId(server.StaticNodes() - 1);
2211+
dynamicNameserviceConfig->MinDynamicNodeId = runtime.GetNodeId(server.StaticNodes());
2212+
dynamicNameserviceConfig->MaxDynamicNodeId = runtime.GetNodeId(server.StaticNodes() + server.DynamicNodes() - 1);
2213+
2214+
// Bridge description with two SYNCHRONIZED piles named "1" and "2".
auto bridgeInfo = std::make_shared<TBridgeInfo>();
2215+
bridgeInfo->Piles.push_back(TBridgeInfo::TPile{.BridgePileId = TBridgePileId::FromPileIndex(0), .Name = "1", .State = NKikimrBridge::TClusterState::SYNCHRONIZED});
2216+
bridgeInfo->Piles.push_back(TBridgeInfo::TPile{.BridgePileId = TBridgePileId::FromPileIndex(1), .Name = "2", .State = NKikimrBridge::TClusterState::SYNCHRONIZED});
2217+
// Taken after both push_backs, so this points at the first pile ("1").
bridgeInfo->SelfNodePile = bridgeInfo->Piles.data();
2218+
2219+
// Observer: filters and rewrites events before the runtime processes them.
auto observerFunc = [&](TAutoPtr<IEventHandle>& ev) {
2220+
auto type = ev->GetTypeRewrite();
2221+
// Drop everything whose type lies in the ES_SYSTEM_VIEW event space.
if (EventSpaceBegin(TKikimrEvents::ES_SYSTEM_VIEW) <= type && type <= EventSpaceEnd(TKikimrEvents::ES_SYSTEM_VIEW)) {
2222+
return TTestActorRuntime::EEventAction::DROP;
2223+
}
2224+
// Inject the two-pile bridge description into node warden storage config events.
if (type == TEvBlobStorage::EvNodeWardenStorageConfig) {
2225+
// The generic handle is reinterpreted as the typed pointer so the event can be mutated in place.
auto* x = reinterpret_cast<TEvNodeWardenStorageConfig::TPtr*>(&ev);
2226+
(*x)->Get()->BridgeInfo = bridgeInfo;
2227+
}
2228+
// Set bridge pile id 1 on every storage group in the whiteboard response.
if (type == NNodeWhiteboard::TEvWhiteboard::EvBSGroupStateResponse) {
2229+
auto* x = reinterpret_cast<NNodeWhiteboard::TEvWhiteboard::TEvBSGroupStateResponse::TPtr*>(&ev);
2230+
for (auto& group : *(*x)->Get()->Record.mutable_bsgroupstateinfo()) {
2231+
group.set_bridgepileid(1);
2232+
}
2233+
}
2234+
// Force the first reported VDisk into the SyncGuidRecovery state.
// NOTE(review): element 0 is accessed unconditionally; this assumes every
// VDisk state response carries at least one entry — confirm.
if (type == NNodeWhiteboard::TEvWhiteboard::EvVDiskStateResponse) {
2235+
auto *x = reinterpret_cast<NNodeWhiteboard::TEvWhiteboard::TEvVDiskStateResponse::TPtr*>(&ev);
2236+
(*x)->Get()->Record.mutable_vdiskstateinfo(0)->set_vdiskstate(NKikimrWhiteboard::EVDiskState::SyncGuidRecovery);
2237+
}
2238+
return TTestActorRuntime::EEventAction::PROCESS;
2239+
};
2240+
runtime.SetObserverFunc(observerFunc);
2241+
2242+
TActorId sender = runtime.AllocateEdgeActor();
2243+
TAutoPtr<IEventHandle> handle;
2244+
2245+
// Send a verbose self-check request for database "/Root" and wait for the result.
auto *request = new NHealthCheck::TEvSelfCheckRequest;
2246+
request->Request.set_return_verbose_status(true);
2247+
request->Database = "/Root";
2248+
runtime.Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, request, 0));
2249+
const auto result = runtime.GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
2250+
2251+
Ctest << result.ShortDebugString() << Endl;
2252+
2253+
UNIT_ASSERT_VALUES_EQUAL(result.self_check_result(), Ydb::Monitoring::SelfCheck::EMERGENCY);
2254+
2255+
// Look for a level-3 SYSTEM_TABLET issue and verify it names exactly the BSController tablet.
bool bscTabletIssueFoundInResult = false;
2256+
for (const auto &issue_log : result.issue_log()) {
2257+
if (issue_log.level() == 3 && issue_log.type() == "SYSTEM_TABLET") {
2258+
UNIT_ASSERT_VALUES_EQUAL(issue_log.location().compute().tablet().id().size(), 1);
2259+
UNIT_ASSERT_VALUES_EQUAL(issue_log.location().compute().tablet().id()[0], ToString(MakeBSControllerID()));
2260+
UNIT_ASSERT_VALUES_EQUAL(issue_log.location().compute().tablet().type(), "BSController");
2261+
bscTabletIssueFoundInResult = true;
2262+
}
2263+
}
2264+
UNIT_ASSERT(bscTabletIssueFoundInResult);
2265+
2266+
// Storage group status must still be reported: one RED issue for pool "static", pile "1".
CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 1, TLocationFilter().Pool("static").Pile("1"));
2267+
}
2268+
21952269
// Delegates to ShardsQuotaTest with arguments (999, 1000, 1) and an expected RED status.
// NOTE(review): ShardsQuotaTest is defined outside this view; the meaning of the
// three numeric arguments is not visible here — confirm against the helper.
Y_UNIT_TEST(ShardsLimit999) {
21962270
ShardsQuotaTest(999, 1000, 1, Ydb::Monitoring::StatusFlag::RED);
21972271
}

0 commit comments

Comments
 (0)