Skip to content

Commit bd68c36

Browse files
authored
use health check status for cluster status (#27305)
1 parent 23231d0 commit bd68c36

File tree

2 files changed

+104
-27
lines changed

2 files changed

+104
-27
lines changed

ydb/core/viewer/json_handlers_viewer.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,7 @@ void InitViewerStorageUsageJsonHandler(TJsonHandlers &handlers) {
233233
}
234234

235235
void InitViewerClusterJsonHandler(TJsonHandlers& handlers) {
236-
handlers.AddHandler("/viewer/cluster", new TJsonHandler<TJsonCluster>(TJsonCluster::GetSwagger()), 7);
236+
handlers.AddHandler("/viewer/cluster", new TJsonHandler<TJsonCluster>(TJsonCluster::GetSwagger()), 8);
237237
}
238238

239239
void InitViewerLabeledCountersJsonHandler(TJsonHandlers &handlers) {

ydb/core/viewer/viewer_cluster.h

Lines changed: 103 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "viewer.h"
55
#include "viewer_helper.h"
66
#include "viewer_tabletinfo.h"
7+
#include <ydb/library/actors/interconnect/interconnect.h>
78
#include <ydb/public/api/protos/ydb_bridge_common.pb.h>
89

910
namespace NKikimr::NViewer {
@@ -32,6 +33,10 @@ class TJsonCluster : public TViewerPipeClient {
3233
std::unordered_map<TNodeId, TRequestResponse<TEvViewer::TEvViewerResponse>> SystemViewerResponse;
3334
std::unordered_map<TNodeId, TRequestResponse<TEvViewer::TEvViewerResponse>> TabletViewerResponse;
3435

36+
std::optional<TRequestResponse<NHealthCheck::TEvSelfCheckResult>> SelfCheckResult; // from the local health check
37+
std::optional<TRequestResponse<NHealthCheck::TEvSelfCheckResultProto>> SelfCheckResultProto; // from the metadata cache service
38+
TViewerPipeClient::TRequestResponse<TEvStateStorage::TEvBoardInfo> MetadataCacheEndpointsLookup;
39+
3540
struct TNode {
3641
TEvInterconnect::TNodeInfo NodeInfo;
3742
NKikimrWhiteboard::TSystemStateInfo SystemState;
@@ -91,26 +96,27 @@ class TJsonCluster : public TViewerPipeClient {
9196
std::unordered_set<TTabletId> FilterTablets;
9297
bool OffloadMerge = true;
9398
size_t OffloadMergeAttempts = 2;
99+
bool UseHealthCheck = true;
100+
bool UseHealthCheckCache = false; // doesn't work for domain ?
101+
TString DomainName;
94102
TTabletId RootHiveId = 0;
95-
TJsonSettings JsonSettings;
96-
ui32 Timeout;
97103
bool Tablets = false;
98104

99105
public:
100106
TJsonCluster(IViewer* viewer, NMon::TEvHttpInfo::TPtr& ev)
101107
: TViewerPipeClient(viewer, ev)
102108
{
103-
const auto& params(Event->Get()->Request.GetParams());
104-
JsonSettings.EnumAsNumbers = !FromStringWithDefault<bool>(params.Get("enums"), true);
105-
JsonSettings.UI64AsString = !FromStringWithDefault<bool>(params.Get("ui64"), false);
106-
InitConfig(params);
107-
Tablets = FromStringWithDefault<bool>(params.Get("tablets"), false);
108-
Timeout = FromStringWithDefault<ui32>(params.Get("timeout"), 10000);
109-
OffloadMerge = FromStringWithDefault<bool>(params.Get("offload_merge"), OffloadMerge);
110-
OffloadMergeAttempts = FromStringWithDefault<bool>(params.Get("offload_merge_attempts"), OffloadMergeAttempts);
111109
}
112110

113111
void Bootstrap() override {
112+
Tablets = FromStringWithDefault<bool>(Params.Get("tablets"), false);
113+
OffloadMerge = FromStringWithDefault<bool>(Params.Get("offload_merge"), OffloadMerge);
114+
OffloadMergeAttempts = FromStringWithDefault<bool>(Params.Get("offload_merge_attempts"), OffloadMergeAttempts);
115+
UseHealthCheck = FromStringWithDefault<bool>(Params.Get("use_health_check"), UseHealthCheck);
116+
UseHealthCheckCache = FromStringWithDefault<bool>(Params.Get("use_health_check_cache"), UseHealthCheckCache);
117+
if (!UseHealthCheck) {
118+
UseHealthCheckCache = false;
119+
}
114120
NodesInfoResponse = MakeRequest<TEvInterconnect::TEvNodesInfo>(GetNameserviceActorId(), new TEvInterconnect::TEvListNodes());
115121
if (AppData()->BridgeModeEnabled) {
116122
NodeWardenStorageConfigResponse = MakeRequest<TEvNodeWardenStorageConfig>(MakeBlobStorageNodeWardenID(SelfId().NodeId()),
@@ -124,7 +130,8 @@ class TJsonCluster : public TViewerPipeClient {
124130
ListTenantsResponse = MakeRequestConsoleListTenants();
125131
if (AppData()->DomainsInfo && AppData()->DomainsInfo->Domain) {
126132
TIntrusivePtr<TDomainsInfo> domains = AppData()->DomainsInfo;
127-
ClusterInfo.SetDomain(TStringBuilder() << "/" << AppData()->DomainsInfo->Domain->Name);
133+
DomainName = TStringBuilder() << "/" << AppData()->DomainsInfo->Domain->Name;
134+
ClusterInfo.SetDomain(DomainName);
128135
if (const auto& domain = domains->Domain) {
129136
for (TTabletId id : domain->Coordinators) {
130137
FilterTablets.insert(id);
@@ -146,8 +153,16 @@ class TJsonCluster : public TViewerPipeClient {
146153
FilterTablets.insert(MakeNodeBrokerID());
147154
FilterTablets.insert(MakeTenantSlotBrokerID());
148155
FilterTablets.insert(MakeConsoleID());
156+
157+
if (UseHealthCheck) {
158+
if (UseHealthCheckCache && AppData()->FeatureFlags.GetEnableDbMetadataCache()) {
159+
MetadataCacheEndpointsLookup = MakeRequestStateStorageMetadataCacheEndpointsLookup(DomainName);
160+
} else {
161+
SendHealthCheckRequest();
162+
}
163+
}
149164
}
150-
Become(&TThis::StateWork, TDuration::MilliSeconds(Timeout), new TEvents::TEvWakeup());
165+
Become(&TThis::StateWork, Timeout, new TEvents::TEvWakeup());
151166
}
152167

153168
private:
@@ -421,7 +436,7 @@ class TJsonCluster : public TViewerPipeClient {
421436
if (SystemViewerResponse.count(nodeId) == 0) {
422437
auto viewerRequest = std::make_unique<TEvViewer::TEvViewerRequest>();
423438
InitSystemWhiteboardRequest(viewerRequest->Record.MutableSystemRequest());
424-
viewerRequest->Record.SetTimeout(Timeout / 2);
439+
viewerRequest->Record.SetTimeout(Timeout.MilliSeconds() / 2);
425440
for (const TNode* node : batch.NodesToAskAbout) {
426441
viewerRequest->Record.MutableLocation()->AddNodeId(node->NodeId);
427442
}
@@ -432,7 +447,7 @@ class TJsonCluster : public TViewerPipeClient {
432447
if (Tablets && batch.HasStaticNodes && TabletViewerResponse.count(nodeId) == 0) {
433448
auto viewerRequest = std::make_unique<TEvViewer::TEvViewerRequest>();
434449
InitTabletWhiteboardRequest(viewerRequest->Record.MutableTabletRequest());
435-
viewerRequest->Record.SetTimeout(Timeout / 2);
450+
viewerRequest->Record.SetTimeout(Timeout.MilliSeconds() / 2);
436451
for (const TNode* node : batch.NodesToAskAbout) {
437452
if (node->Static) {
438453
viewerRequest->Record.MutableLocation()->AddNodeId(node->NodeId);
@@ -639,6 +654,45 @@ class TJsonCluster : public TViewerPipeClient {
639654
RequestDone();
640655
}
641656

657+
std::unique_ptr<NHealthCheck::TEvSelfCheckRequest> MakeSelfCheckRequest() {
658+
auto request = std::make_unique<NHealthCheck::TEvSelfCheckRequest>();
659+
request->Database = DomainName;
660+
return request;
661+
}
662+
663+
// Issues a self-check request to the actor identified by NHealthCheck::MakeHealthCheckID()
// and keeps the resulting request/response tracker in SelfCheckResult.
void SendHealthCheckRequest() {
    // MakeRequest takes a raw event pointer, so ownership is released to it here.
    auto* rawRequest = MakeSelfCheckRequest().release();
    SelfCheckResult = MakeRequest<NHealthCheck::TEvSelfCheckResult>(NHealthCheck::MakeHealthCheckID(), rawRequest);
}
666+
667+
// Receives the TEvSelfCheckResult reply and records it in SelfCheckResult.
// RequestDone() is called only when Set() reports that the reply was accepted.
void Handle(NHealthCheck::TEvSelfCheckResult::TPtr& ev) {
    const bool accepted = SelfCheckResult->Set(std::move(ev));
    if (!accepted) {
        return;
    }
    RequestDone();
}
672+
673+
// Receives the TEvSelfCheckResultProto reply and records it in SelfCheckResultProto.
// RequestDone() is called only when Set() reports that the reply was accepted.
void Handle(NHealthCheck::TEvSelfCheckResultProto::TPtr& ev) {
    const bool accepted = SelfCheckResultProto->Set(std::move(ev));
    if (!accepted) {
        return;
    }
    RequestDone();
}
678+
679+
// Receives the board lookup reply for the metadata cache endpoints.
// If the lookup is OK and PickActiveNode() returns a non-zero node, a TEvSelfCheckRequestProto
// is sent to that node's metadata cache actor. If no such request was issued, falls back to
// SendHealthCheckRequest(). The lookup request itself is then marked done.
void Handle(TEvStateStorage::TEvBoardInfo::TPtr& ev) {
    if (!MetadataCacheEndpointsLookup.Set(std::move(ev))) {
        return;
    }

    if (MetadataCacheEndpointsLookup.IsOk()) {
        auto cacheNode = TDatabaseMetadataCache::PickActiveNode(MetadataCacheEndpointsLookup->InfoEntries);
        if (cacheNode != 0) {
            TActorId cacheActor = MakeDatabaseMetadataCacheId(cacheNode);
            auto cacheRequest = std::make_unique<NHealthCheck::TEvSelfCheckRequestProto>();
            SelfCheckResultProto = MakeRequest<NHealthCheck::TEvSelfCheckResultProto>(
                cacheActor,
                cacheRequest.release(),
                IEventHandle::FlagTrackDelivery | IEventHandle::FlagSubscribeOnSession,
                cacheNode);
        }
    }

    // No cache request was issued (lookup failed or no active node): use the other request path.
    if (!SelfCheckResultProto) {
        SendHealthCheckRequest();
    }

    RequestDone();
}
695+
642696
void Handle(TEvInterconnect::TEvNodesInfo::TPtr& ev) {
643697
if (NodesInfoResponse->Set(std::move(ev))) {
644698
ProcessResponses();
@@ -903,6 +957,8 @@ class TJsonCluster : public TViewerPipeClient {
903957
hFunc(NSysView::TEvSysView::TEvGetVSlotsResponse, Handle);
904958
hFunc(NSysView::TEvSysView::TEvGetGroupsResponse, Handle);
905959
hFunc(TEvHive::TEvResponseHiveNodeStats, Handle);
960+
hFunc(NHealthCheck::TEvSelfCheckResult, Handle);
961+
hFunc(NHealthCheck::TEvSelfCheckResultProto, Handle);
906962
hFunc(TEvents::TEvUndelivered, Undelivered);
907963
hFunc(TEvInterconnect::TEvNodeDisconnected, Disconnected);
908964
hFunc(TEvTabletPipe::TEvClientConnected, Handle);
@@ -922,6 +978,21 @@ class TJsonCluster : public TViewerPipeClient {
922978
}
923979
}
924980

981+
// Maps a self-check status to a whiteboard flag:
//   GOOD                              -> Green
//   DEGRADED                          -> Yellow
//   MAINTENANCE_REQUIRED, EMERGENCY   -> Red
//   any other value                   -> Grey
static NKikimrWhiteboard::EFlag GetClusterStateFromSelfCheck(const Ydb::Monitoring::SelfCheckResult& result) {
    NKikimrWhiteboard::EFlag flag = NKikimrWhiteboard::EFlag::Grey;
    switch (result.self_check_result()) {
        case Ydb::Monitoring::SelfCheck::GOOD:
            flag = NKikimrWhiteboard::EFlag::Green;
            break;
        case Ydb::Monitoring::SelfCheck::DEGRADED:
            flag = NKikimrWhiteboard::EFlag::Yellow;
            break;
        case Ydb::Monitoring::SelfCheck::MAINTENANCE_REQUIRED:
        case Ydb::Monitoring::SelfCheck::EMERGENCY:
            flag = NKikimrWhiteboard::EFlag::Red;
            break;
        default:
            break;
    }
    return flag;
}
995+
925996
void ReplyAndPassAway() override {
926997
if (StorageStatsResponse && StorageStatsResponse->IsOk()) {
927998
for (NKikimrSysView::TStorageStatsEntry& entry : *StorageStatsResponse->Get()->Record.MutableEntries()) {
@@ -1018,20 +1089,26 @@ class TJsonCluster : public TViewerPipeClient {
10181089
if (CachedDataMaxAge) {
10191090
ClusterInfo.SetCachedDataMaxAge(CachedDataMaxAge.MilliSeconds());
10201091
}
1021-
NKikimrWhiteboard::EFlag worstState = NKikimrWhiteboard::EFlag::Grey;
1022-
ui64 worstNodes = 0;
1023-
for (NKikimrWhiteboard::EFlag flag = NKikimrWhiteboard::EFlag::Grey; flag <= NKikimrWhiteboard::EFlag::Red; flag = NKikimrWhiteboard::EFlag(flag + 1)) {
1024-
auto itNodes = ClusterInfo.GetMapNodeStates().find(NKikimrWhiteboard::EFlag_Name(flag));
1025-
if (itNodes == ClusterInfo.GetMapNodeStates().end()) {
1026-
continue;
1027-
}
1028-
auto& nodes = itNodes->second;
1029-
if (nodes > worstNodes / 100) { // only if it's more than 1% of all nodes
1030-
worstState = flag;
1092+
NKikimrWhiteboard::EFlag clusterState = NKikimrWhiteboard::EFlag::Grey;
1093+
if (SelfCheckResult && SelfCheckResult->IsOk()) {
1094+
clusterState = GetClusterStateFromSelfCheck(SelfCheckResult->Get()->Result);
1095+
} else if (SelfCheckResultProto && SelfCheckResultProto->IsOk()) {
1096+
clusterState = GetClusterStateFromSelfCheck(SelfCheckResultProto->Get()->Record);
1097+
} else {
1098+
ui64 worstNodes = 0;
1099+
for (NKikimrWhiteboard::EFlag flag = NKikimrWhiteboard::EFlag::Grey; flag <= NKikimrWhiteboard::EFlag::Red; flag = NKikimrWhiteboard::EFlag(flag + 1)) {
1100+
auto itNodes = ClusterInfo.GetMapNodeStates().find(NKikimrWhiteboard::EFlag_Name(flag));
1101+
if (itNodes == ClusterInfo.GetMapNodeStates().end()) {
1102+
continue;
1103+
}
1104+
auto& nodes = itNodes->second;
1105+
if (nodes > worstNodes / 100) { // only if it's more than 1% of all nodes
1106+
clusterState = flag;
1107+
}
1108+
worstNodes += nodes;
10311109
}
1032-
worstNodes += nodes;
10331110
}
1034-
ClusterInfo.SetOverall(GetViewerFlag(worstState));
1111+
ClusterInfo.SetOverall(GetViewerFlag(clusterState));
10351112
TBase::ReplyAndPassAway(GetHTTPOKJSON(ClusterInfo));
10361113
}
10371114

0 commit comments

Comments
 (0)